aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJames Morris <jmorris@namei.org>2009-02-05 19:01:45 -0500
committerJames Morris <jmorris@namei.org>2009-02-05 19:01:45 -0500
commitcb5629b10d64a8006622ce3a52bc887d91057d69 (patch)
tree7c06d8f30783115e3384721046258ce615b129c5 /fs
parent8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff)
parentf01d1d546abb2f4028b5299092f529eefb01253a (diff)
Merge branch 'master' into next
Conflicts:
  fs/namei.c

Manually merged per:

diff --cc fs/namei.c
index 734f2b5,bbc15c2..0000000
--- a/fs/namei.c
+++ b/fs/namei.c
@@@ -860,9 -848,8 +849,10 @@@ static int __link_path_walk(const char
  nd->flags |= LOOKUP_CONTINUE;
  err = exec_permission_lite(inode);
  if (err == -EAGAIN)
- err = vfs_permission(nd, MAY_EXEC);
+ err = inode_permission(nd->path.dentry->d_inode,
+ MAY_EXEC);
+ if (!err)
+ err = ima_path_check(&nd->path, MAY_EXEC);
  if (err)
  break;
@@@ -1525,14 -1506,9 +1509,14 @@@ int may_open(struct path *path, int acc
  flag &= ~O_TRUNC;
  }
- error = vfs_permission(nd, acc_mode);
+ error = inode_permission(inode, acc_mode);
  if (error)
  return error;
+
- error = ima_path_check(&nd->path,
++ error = ima_path_check(path,
+ acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+ if (error)
+ return error;
  /*
  * An append-only file must be opened in append mode for writing.
  */

Signed-off-by: James Morris <jmorris@namei.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig10
-rw-r--r--fs/Kconfig1386
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/Makefile8
-rw-r--r--fs/adfs/Kconfig27
-rw-r--r--fs/affs/Kconfig21
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/affs/inode.c3
-rw-r--r--fs/afs/Kconfig21
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/aio.c22
-rw-r--r--fs/anon_inodes.c7
-rw-r--r--fs/autofs/Kconfig21
-rw-r--r--fs/autofs/inode.c2
-rw-r--r--fs/autofs4/Kconfig20
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c75
-rw-r--r--fs/autofs4/expire.c4
-rw-r--r--fs/autofs4/inode.c18
-rw-r--r--fs/autofs4/waitq.c8
-rw-r--r--fs/bad_inode.c6
-rw-r--r--fs/befs/Kconfig26
-rw-r--r--fs/befs/linuxvfs.c5
-rw-r--r--fs/bfs/Kconfig19
-rw-r--r--fs/bfs/inode.c45
-rw-r--r--fs/binfmt_aout.c81
-rw-r--r--fs/binfmt_elf.c12
-rw-r--r--fs/binfmt_elf_fdpic.c35
-rw-r--r--fs/binfmt_flat.c34
-rw-r--r--fs/binfmt_misc.c5
-rw-r--r--fs/bio-integrity.c26
-rw-r--r--fs/bio.c36
-rw-r--r--fs/block_dev.c42
-rw-r--r--fs/btrfs/Kconfig18
-rw-r--r--fs/btrfs/Makefile25
-rw-r--r--fs/btrfs/acl.c351
-rw-r--r--fs/btrfs/async-thread.c419
-rw-r--r--fs/btrfs/async-thread.h101
-rw-r--r--fs/btrfs/btrfs_inode.h131
-rw-r--r--fs/btrfs/compat.h7
-rw-r--r--fs/btrfs/compression.c709
-rw-r--r--fs/btrfs/compression.h47
-rw-r--r--fs/btrfs/crc32c.h29
-rw-r--r--fs/btrfs/ctree.c3953
-rw-r--r--fs/btrfs/ctree.h2129
-rw-r--r--fs/btrfs/dir-item.c386
-rw-r--r--fs/btrfs/disk-io.c2343
-rw-r--r--fs/btrfs/disk-io.h102
-rw-r--r--fs/btrfs/export.c203
-rw-r--r--fs/btrfs/export.h19
-rw-r--r--fs/btrfs/extent-tree.c5986
-rw-r--r--fs/btrfs/extent_io.c3717
-rw-r--r--fs/btrfs/extent_io.h269
-rw-r--r--fs/btrfs/extent_map.c351
-rw-r--r--fs/btrfs/extent_map.h62
-rw-r--r--fs/btrfs/file-item.c831
-rw-r--r--fs/btrfs/file.c1288
-rw-r--r--fs/btrfs/free-space-cache.c495
-rw-r--r--fs/btrfs/hash.h27
-rw-r--r--fs/btrfs/inode-item.c206
-rw-r--r--fs/btrfs/inode-map.c144
-rw-r--r--fs/btrfs/inode.c5035
-rw-r--r--fs/btrfs/ioctl.c1132
-rw-r--r--fs/btrfs/ioctl.h69
-rw-r--r--fs/btrfs/locking.c88
-rw-r--r--fs/btrfs/locking.h27
-rw-r--r--fs/btrfs/ordered-data.c730
-rw-r--r--fs/btrfs/ordered-data.h158
-rw-r--r--fs/btrfs/orphan.c67
-rw-r--r--fs/btrfs/print-tree.c216
-rw-r--r--fs/btrfs/print-tree.h23
-rw-r--r--fs/btrfs/ref-cache.c230
-rw-r--r--fs/btrfs/ref-cache.h77
-rw-r--r--fs/btrfs/root-tree.c366
-rw-r--r--fs/btrfs/struct-funcs.c139
-rw-r--r--fs/btrfs/super.c723
-rw-r--r--fs/btrfs/sysfs.c269
-rw-r--r--fs/btrfs/transaction.c1097
-rw-r--r--fs/btrfs/transaction.h106
-rw-r--r--fs/btrfs/tree-defrag.c147
-rw-r--r--fs/btrfs/tree-log.c2898
-rw-r--r--fs/btrfs/tree-log.h41
-rw-r--r--fs/btrfs/version.h4
-rw-r--r--fs/btrfs/version.sh43
-rw-r--r--fs/btrfs/volumes.c3219
-rw-r--r--fs/btrfs/volumes.h162
-rw-r--r--fs/btrfs/xattr.c322
-rw-r--r--fs/btrfs/xattr.h39
-rw-r--r--fs/btrfs/zlib.c632
-rw-r--r--fs/buffer.c81
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/cifs/CHANGES4
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/cifsencrypt.c18
-rw-r--r--fs/cifs/cifsfs.c7
-rw-r--r--fs/cifs/cifsfs.h1
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/connect.c24
-rw-r--r--fs/cifs/dir.c56
-rw-r--r--fs/cifs/fcntl.c118
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/inode.c7
-rw-r--r--fs/cifs/md5.c38
-rw-r--r--fs/cifs/md5.h6
-rw-r--r--fs/cifs/transport.c127
-rw-r--r--fs/coda/Kconfig21
-rw-r--r--fs/coda/file.c12
-rw-r--r--fs/coda/sysctl.c5
-rw-r--r--fs/compat.c12
-rw-r--r--fs/compat_ioctl.c7
-rw-r--r--fs/configfs/Kconfig11
-rw-r--r--fs/configfs/dir.c59
-rw-r--r--fs/configfs/inode.c3
-rw-r--r--fs/cramfs/Kconfig19
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/dcache.c39
-rw-r--r--fs/dcookies.c38
-rw-r--r--fs/debugfs/file.c32
-rw-r--r--fs/debugfs/inode.c3
-rw-r--r--fs/devpts/inode.c472
-rw-r--r--fs/direct-io.c13
-rw-r--r--fs/dlm/ast.c56
-rw-r--r--fs/dlm/ast.h4
-rw-r--r--fs/dlm/debug_fs.c700
-rw-r--r--fs/dlm/dir.c18
-rw-r--r--fs/dlm/dlm_internal.h6
-rw-r--r--fs/dlm/lock.c57
-rw-r--r--fs/dlm/lockspace.c2
-rw-r--r--fs/dlm/lowcomms.c8
-rw-r--r--fs/dlm/memory.c6
-rw-r--r--fs/dlm/midcomms.c2
-rw-r--r--fs/dlm/netlink.c1
-rw-r--r--fs/dlm/plock.c6
-rw-r--r--fs/dlm/recover.c10
-rw-r--r--fs/dlm/user.c4
-rw-r--r--fs/dlm/user.h2
-rw-r--r--fs/dquot.c594
-rw-r--r--fs/ecryptfs/Kconfig11
-rw-r--r--fs/ecryptfs/crypto.c514
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h105
-rw-r--r--fs/ecryptfs/file.c45
-rw-r--r--fs/ecryptfs/inode.c303
-rw-r--r--fs/ecryptfs/keystore.c651
-rw-r--r--fs/ecryptfs/main.c126
-rw-r--r--fs/ecryptfs/messaging.c4
-rw-r--r--fs/ecryptfs/miscdev.c18
-rw-r--r--fs/ecryptfs/mmap.c2
-rw-r--r--fs/efs/Kconfig14
-rw-r--r--fs/eventfd.c5
-rw-r--r--fs/eventpoll.c40
-rw-r--r--fs/exec.c80
-rw-r--r--fs/ext2/dir.c7
-rw-r--r--fs/ext2/ialloc.c14
-rw-r--r--fs/ext2/inode.c9
-rw-r--r--fs/ext2/ioctl.c3
-rw-r--r--fs/ext2/namei.c15
-rw-r--r--fs/ext2/super.c10
-rw-r--r--fs/ext3/hash.c77
-rw-r--r--fs/ext3/ialloc.c14
-rw-r--r--fs/ext3/inode.c9
-rw-r--r--fs/ext3/ioctl.c3
-rw-r--r--fs/ext3/namei.c53
-rw-r--r--fs/ext3/super.c104
-rw-r--r--fs/ext4/balloc.c297
-rw-r--r--fs/ext4/bitmap.c5
-rw-r--r--fs/ext4/dir.c10
-rw-r--r--fs/ext4/ext4.h165
-rw-r--r--fs/ext4/ext4_extents.h5
-rw-r--r--fs/ext4/ext4_i.h16
-rw-r--r--fs/ext4/ext4_jbd2.c83
-rw-r--r--fs/ext4/ext4_jbd2.h87
-rw-r--r--fs/ext4/ext4_sb.h12
-rw-r--r--fs/ext4/extents.c64
-rw-r--r--fs/ext4/file.c3
-rw-r--r--fs/ext4/hash.c77
-rw-r--r--fs/ext4/ialloc.c330
-rw-r--r--fs/ext4/inode.c325
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c631
-rw-r--r--fs/ext4/mballoc.h71
-rw-r--r--fs/ext4/migrate.c19
-rw-r--r--fs/ext4/namei.c134
-rw-r--r--fs/ext4/resize.c114
-rw-r--r--fs/ext4/super.c678
-rw-r--r--fs/ext4/xattr.c25
-rw-r--r--fs/fat/Kconfig97
-rw-r--r--fs/fat/dir.c1
-rw-r--r--fs/fat/inode.c2
-rw-r--r--fs/fat/namei_vfat.c2
-rw-r--r--fs/fcntl.c11
-rw-r--r--fs/file_table.c10
-rw-r--r--fs/filesystems.c25
-rw-r--r--fs/freevxfs/Kconfig16
-rw-r--r--fs/freevxfs/vxfs_inode.c4
-rw-r--r--fs/fs-writeback.c92
-rw-r--r--fs/fuse/Kconfig15
-rw-r--r--fs/fuse/control.c6
-rw-r--r--fs/fuse/dev.c121
-rw-r--r--fs/fuse/dir.c48
-rw-r--r--fs/fuse/file.c463
-rw-r--r--fs/fuse/fuse_i.h83
-rw-r--r--fs/fuse/inode.c185
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/bmap.c77
-rw-r--r--fs/gfs2/bmap.h34
-rw-r--r--fs/gfs2/daemon.c136
-rw-r--r--fs/gfs2/daemon.h17
-rw-r--r--fs/gfs2/dir.c62
-rw-r--r--fs/gfs2/dir.h1
-rw-r--r--fs/gfs2/eattr.c40
-rw-r--r--fs/gfs2/glock.c303
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c56
-rw-r--r--fs/gfs2/incore.h55
-rw-r--r--fs/gfs2/inode.c53
-rw-r--r--fs/gfs2/inode.h13
-rw-r--r--fs/gfs2/locking/dlm/mount.c12
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c16
-rw-r--r--fs/gfs2/main.c15
-rw-r--r--fs/gfs2/mount.c29
-rw-r--r--fs/gfs2/ops_address.c35
-rw-r--r--fs/gfs2/ops_dentry.c2
-rw-r--r--fs/gfs2/ops_dentry.h17
-rw-r--r--fs/gfs2/ops_export.c5
-rw-r--r--fs/gfs2/ops_file.c24
-rw-r--r--fs/gfs2/ops_fstype.c125
-rw-r--r--fs/gfs2/ops_fstype.h19
-rw-r--r--fs/gfs2/ops_inode.c75
-rw-r--r--fs/gfs2/ops_inode.h25
-rw-r--r--fs/gfs2/ops_super.c165
-rw-r--r--fs/gfs2/ops_super.h17
-rw-r--r--fs/gfs2/quota.c113
-rw-r--r--fs/gfs2/quota.h24
-rw-r--r--fs/gfs2/recovery.c48
-rw-r--r--fs/gfs2/recovery.h14
-rw-r--r--fs/gfs2/rgrp.c58
-rw-r--r--fs/gfs2/super.c246
-rw-r--r--fs/gfs2/super.h13
-rw-r--r--fs/gfs2/sys.c66
-rw-r--r--fs/gfs2/sys.h4
-rw-r--r--fs/gfs2/util.c1
-rw-r--r--fs/gfs2/util.h1
-rw-r--r--fs/hfs/Kconfig12
-rw-r--r--fs/hfsplus/Kconfig13
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/Kconfig14
-rw-r--r--fs/hugetlbfs/inode.c13
-rw-r--r--fs/inode.c74
-rw-r--r--fs/ioctl.c92
-rw-r--r--fs/ioprio.c8
-rw-r--r--fs/isofs/Kconfig39
-rw-r--r--fs/isofs/inode.c6
-rw-r--r--fs/jbd/commit.c15
-rw-r--r--fs/jbd/transaction.c39
-rw-r--r--fs/jbd2/checkpoint.c24
-rw-r--r--fs/jbd2/commit.c67
-rw-r--r--fs/jbd2/journal.c143
-rw-r--r--fs/jbd2/transaction.c107
-rw-r--r--fs/jffs2/compr_rubin.c120
-rw-r--r--fs/jffs2/erase.c5
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/nodelist.h3
-rw-r--r--fs/jfs/Kconfig49
-rw-r--r--fs/jfs/jfs_imap.c10
-rw-r--r--fs/jfs/jfs_inode.c29
-rw-r--r--fs/jfs/namei.c24
-rw-r--r--fs/jfs/super.c10
-rw-r--r--fs/libfs.c7
-rw-r--r--fs/lockd/clntproc.c7
-rw-r--r--fs/lockd/host.c170
-rw-r--r--fs/lockd/mon.c569
-rw-r--r--fs/lockd/svc.c72
-rw-r--r--fs/lockd/svc4proc.c13
-rw-r--r--fs/lockd/svcproc.c13
-rw-r--r--fs/lockd/svcsubs.c1
-rw-r--r--fs/lockd/xdr.c5
-rw-r--r--fs/lockd/xdr4.c5
-rw-r--r--fs/locks.c2
-rw-r--r--fs/minix/Kconfig17
-rw-r--r--fs/minix/dir.c2
-rw-r--r--fs/mpage.c6
-rw-r--r--fs/namei.c197
-rw-r--r--fs/namespace.c15
-rw-r--r--fs/ncpfs/Kconfig21
-rw-r--r--fs/ncpfs/getopt.c1
-rw-r--r--fs/ncpfs/ioctl.c2
-rw-r--r--fs/nfs/Kconfig86
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfsctl.c9
-rw-r--r--fs/nfsd/Kconfig80
-rw-r--r--fs/nfsd/auth.c7
-rw-r--r--fs/nfsd/nfs4callback.c3
-rw-r--r--fs/nfsd/nfs4proc.c5
-rw-r--r--fs/nfsd/nfs4recover.c2
-rw-r--r--fs/nfsd/nfs4state.c80
-rw-r--r--fs/nfsd/nfs4xdr.c2
-rw-r--r--fs/nfsd/nfsctl.c479
-rw-r--r--fs/nfsd/nfsfh.c36
-rw-r--r--fs/nfsd/nfsproc.c1
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/notify/Kconfig2
-rw-r--r--fs/notify/Makefile2
-rw-r--r--fs/notify/dnotify/Kconfig10
-rw-r--r--fs/notify/dnotify/Makefile1
-rw-r--r--fs/notify/dnotify/dnotify.c (renamed from fs/dnotify.c)3
-rw-r--r--fs/notify/inotify/Kconfig27
-rw-r--r--fs/notify/inotify/Makefile2
-rw-r--r--fs/notify/inotify/inotify.c (renamed from fs/inotify.c)0
-rw-r--r--fs/notify/inotify/inotify_user.c (renamed from fs/inotify_user.c)146
-rw-r--r--fs/ntfs/Kconfig78
-rw-r--r--fs/ntfs/inode.c3
-rw-r--r--fs/ocfs2/Kconfig85
-rw-r--r--fs/ocfs2/Makefile7
-rw-r--r--fs/ocfs2/acl.c479
-rw-r--r--fs/ocfs2/acl.h58
-rw-r--r--fs/ocfs2/alloc.c715
-rw-r--r--fs/ocfs2/alloc.h30
-rw-r--r--fs/ocfs2/aops.c59
-rw-r--r--fs/ocfs2/blockcheck.c477
-rw-r--r--fs/ocfs2/blockcheck.h82
-rw-r--r--fs/ocfs2/buffer_head_io.c32
-rw-r--r--fs/ocfs2/buffer_head_io.h27
-rw-r--r--fs/ocfs2/cluster/heartbeat.c2
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/dcache.c42
-rw-r--r--fs/ocfs2/dcache.h9
-rw-r--r--fs/ocfs2/dir.c399
-rw-r--r--fs/ocfs2/dir.h2
-rw-r--r--fs/ocfs2/dlm/dlmast.c52
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h3
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c53
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/ocfs2/dlm/dlmfs.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c42
-rw-r--r--fs/ocfs2/dlm/dlmthread.c3
-rw-r--r--fs/ocfs2/dlmglue.c176
-rw-r--r--fs/ocfs2/dlmglue.h19
-rw-r--r--fs/ocfs2/extent_map.c96
-rw-r--r--fs/ocfs2/extent_map.h24
-rw-r--r--fs/ocfs2/file.c211
-rw-r--r--fs/ocfs2/file.h3
-rw-r--r--fs/ocfs2/inode.c175
-rw-r--r--fs/ocfs2/inode.h18
-rw-r--r--fs/ocfs2/journal.c364
-rw-r--r--fs/ocfs2/journal.h128
-rw-r--r--fs/ocfs2/localalloc.c26
-rw-r--r--fs/ocfs2/namei.c318
-rw-r--r--fs/ocfs2/ocfs2.h52
-rw-r--r--fs/ocfs2/ocfs2_fs.h213
-rw-r--r--fs/ocfs2/ocfs2_jbd_compat.h82
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/quota.h119
-rw-r--r--fs/ocfs2/quota_global.c862
-rw-r--r--fs/ocfs2/quota_local.c1253
-rw-r--r--fs/ocfs2/resize.c76
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/suballoc.c363
-rw-r--r--fs/ocfs2/suballoc.h18
-rw-r--r--fs/ocfs2/super.c331
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/xattr.c2979
-rw-r--r--fs/ocfs2/xattr.h45
-rw-r--r--fs/omfs/Kconfig13
-rw-r--r--fs/omfs/inode.c1
-rw-r--r--fs/open.c89
-rw-r--r--fs/openpromfs/inode.c3
-rw-r--r--fs/partitions/check.c12
-rw-r--r--fs/pipe.c11
-rw-r--r--fs/proc/base.c235
-rw-r--r--fs/proc/generic.c8
-rw-r--r--fs/proc/inode.c3
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/meminfo.c6
-rw-r--r--fs/proc/nommu.c71
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/proc_sysctl.c1
-rw-r--r--fs/proc/root.c8
-rw-r--r--fs/proc/stat.c11
-rw-r--r--fs/proc/task_mmu.c8
-rw-r--r--fs/proc/task_nommu.c122
-rw-r--r--fs/proc/vmcore.c2
-rw-r--r--fs/qnx4/Kconfig25
-rw-r--r--fs/quota.c14
-rw-r--r--fs/quota_tree.c645
-rw-r--r--fs/quota_tree.h25
-rw-r--r--fs/quota_v1.c28
-rw-r--r--fs/quota_v2.c631
-rw-r--r--fs/quotaio_v1.h33
-rw-r--r--fs/quotaio_v2.h60
-rw-r--r--fs/ramfs/file-nommu.c21
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c62
-rw-r--r--fs/readdir.c9
-rw-r--r--fs/reiserfs/Kconfig85
-rw-r--r--fs/reiserfs/inode.c30
-rw-r--r--fs/reiserfs/namei.c8
-rw-r--r--fs/reiserfs/super.c20
-rw-r--r--fs/romfs/Kconfig16
-rw-r--r--fs/romfs/inode.c13
-rw-r--r--fs/select.c105
-rw-r--r--fs/seq_file.c128
-rw-r--r--fs/signalfd.c8
-rw-r--r--fs/smbfs/Kconfig55
-rw-r--r--fs/smbfs/file.c2
-rw-r--r--fs/splice.c13
-rw-r--r--fs/squashfs/Kconfig51
-rw-r--r--fs/squashfs/Makefile8
-rw-r--r--fs/squashfs/block.c274
-rw-r--r--fs/squashfs/cache.c412
-rw-r--r--fs/squashfs/dir.c235
-rw-r--r--fs/squashfs/export.c155
-rw-r--r--fs/squashfs/file.c502
-rw-r--r--fs/squashfs/fragment.c98
-rw-r--r--fs/squashfs/id.c94
-rw-r--r--fs/squashfs/inode.c346
-rw-r--r--fs/squashfs/namei.c242
-rw-r--r--fs/squashfs/squashfs.h90
-rw-r--r--fs/squashfs/squashfs_fs.h380
-rw-r--r--fs/squashfs/squashfs_fs_i.h45
-rw-r--r--fs/squashfs/squashfs_fs_sb.h76
-rw-r--r--fs/squashfs/super.c441
-rw-r--r--fs/squashfs/symlink.c118
-rw-r--r--fs/stat.c40
-rw-r--r--fs/super.c14
-rw-r--r--fs/sync.c82
-rw-r--r--fs/sysfs/Kconfig23
-rw-r--r--fs/sysfs/bin.c6
-rw-r--r--fs/sysfs/inode.c3
-rw-r--r--fs/sysv/Kconfig36
-rw-r--r--fs/sysv/inode.c6
-rw-r--r--fs/timerfd.c10
-rw-r--r--fs/ubifs/Kconfig2
-rw-r--r--fs/ubifs/budget.c243
-rw-r--r--fs/ubifs/commit.c25
-rw-r--r--fs/ubifs/compress.c18
-rw-r--r--fs/ubifs/debug.c327
-rw-r--r--fs/ubifs/debug.h127
-rw-r--r--fs/ubifs/dir.c96
-rw-r--r--fs/ubifs/file.c26
-rw-r--r--fs/ubifs/gc.c30
-rw-r--r--fs/ubifs/io.c22
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c8
-rw-r--r--fs/ubifs/key.h32
-rw-r--r--fs/ubifs/lprops.c26
-rw-r--r--fs/ubifs/lpt.c45
-rw-r--r--fs/ubifs/lpt_commit.c254
-rw-r--r--fs/ubifs/master.c2
-rw-r--r--fs/ubifs/orphan.c40
-rw-r--r--fs/ubifs/replay.c15
-rw-r--r--fs/ubifs/sb.c20
-rw-r--r--fs/ubifs/shrinker.c2
-rw-r--r--fs/ubifs/super.c383
-rw-r--r--fs/ubifs/tnc.c43
-rw-r--r--fs/ubifs/tnc_commit.c9
-rw-r--r--fs/ubifs/ubifs-media.h7
-rw-r--r--fs/ubifs/ubifs.h137
-rw-r--r--fs/udf/Kconfig18
-rw-r--r--fs/ufs/Kconfig43
-rw-r--r--fs/utimes.c11
-rw-r--r--fs/xattr.c55
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c79
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c320
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h15
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c186
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c27
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c6
-rw-r--r--fs/xfs/quota/xfs_dquot.c38
-rw-r--r--fs/xfs/quota/xfs_dquot.h10
-rw-r--r--fs/xfs/quota/xfs_qm.c9
-rw-r--r--fs/xfs/xfs_acl.h1
-rw-r--r--fs/xfs/xfs_ag.h8
-rw-r--r--fs/xfs/xfs_alloc_btree.c2
-rw-r--r--fs/xfs/xfs_attr.c26
-rw-r--r--fs/xfs/xfs_attr_leaf.c72
-rw-r--r--fs/xfs/xfs_attr_leaf.h12
-rw-r--r--fs/xfs/xfs_bit.h10
-rw-r--r--fs/xfs/xfs_bmap.c166
-rw-r--r--fs/xfs/xfs_bmap.h2
-rw-r--r--fs/xfs/xfs_bmap_btree.c94
-rw-r--r--fs/xfs/xfs_bmap_btree.h4
-rw-r--r--fs/xfs/xfs_btree.c20
-rw-r--r--fs/xfs/xfs_da_btree.c8
-rw-r--r--fs/xfs/xfs_dfrag.c10
-rw-r--r--fs/xfs/xfs_dir2_block.c7
-rw-r--r--fs/xfs/xfs_dir2_leaf.c6
-rw-r--r--fs/xfs/xfs_dir2_sf.c15
-rw-r--r--fs/xfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_fsops.c11
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_ialloc.c6
-rw-r--r--fs/xfs/xfs_ialloc.h2
-rw-r--r--fs/xfs/xfs_ialloc_btree.h1
-rw-r--r--fs/xfs/xfs_inode.c19
-rw-r--r--fs/xfs/xfs_inode_item.h4
-rw-r--r--fs/xfs/xfs_iomap.c10
-rw-r--r--fs/xfs/xfs_itable.c6
-rw-r--r--fs/xfs/xfs_log_recover.c31
-rw-r--r--fs/xfs/xfs_mount.c26
-rw-r--r--fs/xfs/xfs_mount.h9
-rw-r--r--fs/xfs/xfs_rename.c2
-rw-r--r--fs/xfs/xfs_rtalloc.c2
-rw-r--r--fs/xfs/xfs_rw.h1
-rw-r--r--fs/xfs/xfs_sb.h2
-rw-r--r--fs/xfs/xfs_types.h4
-rw-r--r--fs/xfs/xfs_vnodeops.c20
512 files changed, 68059 insertions, 11282 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
new file mode 100644
index 000000000000..74e0723e90bc
--- /dev/null
+++ b/fs/9p/Kconfig
@@ -0,0 +1,10 @@
1config 9P_FS
2 tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
3 depends on INET && NET_9P && EXPERIMENTAL
4 help
5 If you say Y here, you will get experimental support for
6 Plan 9 resource sharing via the 9P2000 protocol.
7
8 See <http://v9fs.sf.net> for more information.
9
10 If unsure, say N.
diff --git a/fs/Kconfig b/fs/Kconfig
index 522469a7eca3..93945dd0b1ae 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -27,141 +27,8 @@ config FS_MBCACHE
27 default y if EXT4_FS=y && EXT4_FS_XATTR 27 default y if EXT4_FS=y && EXT4_FS_XATTR
28 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR 28 default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
29 29
30config REISERFS_FS 30source "fs/reiserfs/Kconfig"
31 tristate "Reiserfs support" 31source "fs/jfs/Kconfig"
32 help
33 Stores not just filenames but the files themselves in a balanced
34 tree. Uses journalling.
35
36 Balanced trees are more efficient than traditional file system
37 architectural foundations.
38
39 In general, ReiserFS is as fast as ext2, but is very efficient with
40 large directories and small files. Additional patches are needed
41 for NFS and quotas, please see <http://www.namesys.com/> for links.
42
43 It is more easily extended to have features currently found in
44 database and keyword search systems than block allocation based file
45 systems are. The next version will be so extended, and will support
46 plugins consistent with our motto ``It takes more than a license to
47 make source code open.''
48
49 Read <http://www.namesys.com/> to learn more about reiserfs.
50
51 Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
52
53 If you like it, you can pay us to add new features to it that you
54 need, buy a support contract, or pay us to port it to another OS.
55
56config REISERFS_CHECK
57 bool "Enable reiserfs debug mode"
58 depends on REISERFS_FS
59 help
60 If you set this to Y, then ReiserFS will perform every check it can
61 possibly imagine of its internal consistency throughout its
62 operation. It will also go substantially slower. More than once we
63 have forgotten that this was on, and then gone despondent over the
64 latest benchmarks.:-) Use of this option allows our team to go all
65 out in checking for consistency when debugging without fear of its
66 effect on end users. If you are on the verge of sending in a bug
67 report, say Y and you might get a useful error message. Almost
68 everyone should say N.
69
70config REISERFS_PROC_INFO
71 bool "Stats in /proc/fs/reiserfs"
72 depends on REISERFS_FS && PROC_FS
73 help
74 Create under /proc/fs/reiserfs a hierarchy of files, displaying
75 various ReiserFS statistics and internal data at the expense of
76 making your kernel or module slightly larger (+8 KB). This also
77 increases the amount of kernel memory required for each mount.
78 Almost everyone but ReiserFS developers and people fine-tuning
79 reiserfs or tracing problems should say N.
80
81config REISERFS_FS_XATTR
82 bool "ReiserFS extended attributes"
83 depends on REISERFS_FS
84 help
85 Extended attributes are name:value pairs associated with inodes by
86 the kernel or by users (see the attr(5) manual page, or visit
87 <http://acl.bestbits.at/> for details).
88
89 If unsure, say N.
90
91config REISERFS_FS_POSIX_ACL
92 bool "ReiserFS POSIX Access Control Lists"
93 depends on REISERFS_FS_XATTR
94 select FS_POSIX_ACL
95 help
96 Posix Access Control Lists (ACLs) support permissions for users and
97 groups beyond the owner/group/world scheme.
98
99 To learn more about Access Control Lists, visit the Posix ACLs for
100 Linux website <http://acl.bestbits.at/>.
101
102 If you don't know what Access Control Lists are, say N
103
104config REISERFS_FS_SECURITY
105 bool "ReiserFS Security Labels"
106 depends on REISERFS_FS_XATTR
107 help
108 Security labels support alternative access control models
109 implemented by security modules like SELinux. This option
110 enables an extended attribute handler for file security
111 labels in the ReiserFS filesystem.
112
113 If you are not using a security module that requires using
114 extended attributes for file security labels, say N.
115
116config JFS_FS
117 tristate "JFS filesystem support"
118 select NLS
119 help
120 This is a port of IBM's Journaled Filesystem. More information is
121 available in the file <file:Documentation/filesystems/jfs.txt>.
122
123 If you do not intend to use the JFS filesystem, say N.
124
125config JFS_POSIX_ACL
126 bool "JFS POSIX Access Control Lists"
127 depends on JFS_FS
128 select FS_POSIX_ACL
129 help
130 Posix Access Control Lists (ACLs) support permissions for users and
131 groups beyond the owner/group/world scheme.
132
133 To learn more about Access Control Lists, visit the Posix ACLs for
134 Linux website <http://acl.bestbits.at/>.
135
136 If you don't know what Access Control Lists are, say N
137
138config JFS_SECURITY
139 bool "JFS Security Labels"
140 depends on JFS_FS
141 help
142 Security labels support alternative access control models
143 implemented by security modules like SELinux. This option
144 enables an extended attribute handler for file security
145 labels in the jfs filesystem.
146
147 If you are not using a security module that requires using
148 extended attributes for file security labels, say N.
149
150config JFS_DEBUG
151 bool "JFS debugging"
152 depends on JFS_FS
153 help
154 If you are experiencing any problems with the JFS filesystem, say
155 Y here. This will result in additional debugging messages to be
156 written to the system log. Under normal circumstances, this
157 results in very little overhead.
158
159config JFS_STATISTICS
160 bool "JFS statistics"
161 depends on JFS_FS
162 help
163 Enabling this option will cause statistics from the JFS file system
164 to be made available to the user in the /proc/fs/jfs/ directory.
165 32
166config FS_POSIX_ACL 33config FS_POSIX_ACL
167# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4) 34# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
@@ -182,132 +49,12 @@ config FILE_LOCKING
182 49
183source "fs/xfs/Kconfig" 50source "fs/xfs/Kconfig"
184source "fs/gfs2/Kconfig" 51source "fs/gfs2/Kconfig"
185 52source "fs/ocfs2/Kconfig"
186config OCFS2_FS 53source "fs/btrfs/Kconfig"
187 tristate "OCFS2 file system support"
188 depends on NET && SYSFS
189 select CONFIGFS_FS
190 select JBD2
191 select CRC32
192 help
193 OCFS2 is a general purpose extent based shared disk cluster file
194 system with many similarities to ext3. It supports 64 bit inode
195 numbers, and has automatically extending metadata groups which may
196 also make it attractive for non-clustered use.
197
198 You'll want to install the ocfs2-tools package in order to at least
199 get "mount.ocfs2".
200
201 Project web page: http://oss.oracle.com/projects/ocfs2
202 Tools web page: http://oss.oracle.com/projects/ocfs2-tools
203 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
204
205 For more information on OCFS2, see the file
206 <file:Documentation/filesystems/ocfs2.txt>.
207
208config OCFS2_FS_O2CB
209 tristate "O2CB Kernelspace Clustering"
210 depends on OCFS2_FS
211 default y
212 help
213 OCFS2 includes a simple kernelspace clustering package, the OCFS2
214 Cluster Base. It only requires a very small userspace component
215 to configure it. This comes with the standard ocfs2-tools package.
216 O2CB is limited to maintaining a cluster for OCFS2 file systems.
217 It cannot manage any other cluster applications.
218
219 It is always safe to say Y here, as the clustering method is
220 run-time selectable.
221
222config OCFS2_FS_USERSPACE_CLUSTER
223 tristate "OCFS2 Userspace Clustering"
224 depends on OCFS2_FS && DLM
225 default y
226 help
227 This option will allow OCFS2 to use userspace clustering services
228 in conjunction with the DLM in fs/dlm. If you are using a
229 userspace cluster manager, say Y here.
230
231 It is safe to say Y, as the clustering method is run-time
232 selectable.
233
234config OCFS2_FS_STATS
235 bool "OCFS2 statistics"
236 depends on OCFS2_FS
237 default y
238 help
239 This option allows some fs statistics to be captured. Enabling
240 this option may increase the memory consumption.
241
242config OCFS2_DEBUG_MASKLOG
243 bool "OCFS2 logging support"
244 depends on OCFS2_FS
245 default y
246 help
247 The ocfs2 filesystem has an extensive logging system. The system
248 allows selection of events to log via files in /sys/o2cb/logmask/.
249 This option will enlarge your kernel, but it allows debugging of
250 ocfs2 filesystem issues.
251
252config OCFS2_DEBUG_FS
253 bool "OCFS2 expensive checks"
254 depends on OCFS2_FS
255 default n
256 help
257 This option will enable expensive consistency checks. Enable
258 this option for debugging only as it is likely to decrease
259 performance of the filesystem.
260
261config OCFS2_COMPAT_JBD
262 bool "Use JBD for compatibility"
263 depends on OCFS2_FS
264 default n
265 select JBD
266 help
267 The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
268 is backwards compatible with JBD. It is safe to say N here.
269 However, if you really want to use the original JBD, say Y here.
270 54
271endif # BLOCK 55endif # BLOCK
272 56
273config DNOTIFY 57source "fs/notify/Kconfig"
274 bool "Dnotify support"
275 default y
276 help
277 Dnotify is a directory-based per-fd file change notification system
278 that uses signals to communicate events to user-space. There exist
279 superior alternatives, but some applications may still rely on
280 dnotify.
281
282 If unsure, say Y.
283
284config INOTIFY
285 bool "Inotify file change notification support"
286 default y
287 ---help---
288 Say Y here to enable inotify support. Inotify is a file change
289 notification system and a replacement for dnotify. Inotify fixes
290 numerous shortcomings in dnotify and introduces several new features
291 including multiple file events, one-shot support, and unmount
292 notification.
293
294 For more information, see <file:Documentation/filesystems/inotify.txt>
295
296 If unsure, say Y.
297
298config INOTIFY_USER
299 bool "Inotify support for userspace"
300 depends on INOTIFY
301 default y
302 ---help---
303 Say Y here to enable inotify support for userspace, including the
304 associated system calls. Inotify allows monitoring of both files and
305 directories via a single open fd. Events are read from the file
306 descriptor, which is also select()- and poll()-able.
307
308 For more information, see <file:Documentation/filesystems/inotify.txt>
309
310 If unsure, say Y.
311 58
312config QUOTA 59config QUOTA
313 bool "Quota support" 60 bool "Quota support"
@@ -340,6 +87,10 @@ config PRINT_QUOTA_WARNING
340 Note that this behavior is currently deprecated and may go away in 87 Note that this behavior is currently deprecated and may go away in
341 future. Please use notification via netlink socket instead. 88 future. Please use notification via netlink socket instead.
342 89
90# Generic support for tree structured quota files. Selected when needed.
91config QUOTA_TREE
92 tristate
93
343config QFMT_V1 94config QFMT_V1
344 tristate "Old quota format support" 95 tristate "Old quota format support"
345 depends on QUOTA 96 depends on QUOTA
@@ -351,6 +102,7 @@ config QFMT_V1
351config QFMT_V2 102config QFMT_V2
352 tristate "Quota format v2 support" 103 tristate "Quota format v2 support"
353 depends on QUOTA 104 depends on QUOTA
105 select QUOTA_TREE
354 help 106 help
355 This quota format allows using quotas with 32-bit UIDs/GIDs. If you 107 This quota format allows using quotas with 32-bit UIDs/GIDs. If you
356 need this functionality say Y here. 108 need this functionality say Y here.
@@ -360,64 +112,9 @@ config QUOTACTL
360 depends on XFS_QUOTA || QUOTA 112 depends on XFS_QUOTA || QUOTA
361 default y 113 default y
362 114
363config AUTOFS_FS 115source "fs/autofs/Kconfig"
364 tristate "Kernel automounter support" 116source "fs/autofs4/Kconfig"
365 help 117source "fs/fuse/Kconfig"
366 The automounter is a tool to automatically mount remote file systems
367 on demand. This implementation is partially kernel-based to reduce
368 overhead in the already-mounted case; this is unlike the BSD
369 automounter (amd), which is a pure user space daemon.
370
371 To use the automounter you need the user-space tools from the autofs
372 package; you can find the location in <file:Documentation/Changes>.
373 You also want to answer Y to "NFS file system support", below.
374
375 If you want to use the newer version of the automounter with more
376 features, say N here and say Y to "Kernel automounter v4 support",
377 below.
378
379 To compile this support as a module, choose M here: the module will be
380 called autofs.
381
382 If you are not a part of a fairly large, distributed network, you
383 probably do not need an automounter, and can say N here.
384
385config AUTOFS4_FS
386 tristate "Kernel automounter version 4 support (also supports v3)"
387 help
388 The automounter is a tool to automatically mount remote file systems
389 on demand. This implementation is partially kernel-based to reduce
390 overhead in the already-mounted case; this is unlike the BSD
391 automounter (amd), which is a pure user space daemon.
392
393 To use the automounter you need the user-space tools from
394 <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
395 want to answer Y to "NFS file system support", below.
396
397 To compile this support as a module, choose M here: the module will be
398 called autofs4. You will need to add "alias autofs autofs4" to your
399 modules configuration file.
400
401 If you are not a part of a fairly large, distributed network or
402 don't have a laptop which needs to dynamically reconfigure to the
403 local network, you probably do not need an automounter, and can say
404 N here.
405
406config FUSE_FS
407 tristate "FUSE (Filesystem in Userspace) support"
408 help
409 With FUSE it is possible to implement a fully functional filesystem
410 in a userspace program.
411
412 There's also a companion library: libfuse. This library along with
413 utilities is available from the FUSE homepage:
414 <http://fuse.sourceforge.net/>
415
416 See <file:Documentation/filesystems/fuse.txt> for more information.
417 See <file:Documentation/Changes> for needed library/utility version.
418
419 If you want to develop a userspace FS, or if you want to use
420 a filesystem based on FUSE, answer Y or M.
421 118
422config GENERIC_ACL 119config GENERIC_ACL
423 bool 120 bool
@@ -426,64 +123,8 @@ config GENERIC_ACL
426if BLOCK 123if BLOCK
427menu "CD-ROM/DVD Filesystems" 124menu "CD-ROM/DVD Filesystems"
428 125
429config ISO9660_FS 126source "fs/isofs/Kconfig"
430 tristate "ISO 9660 CDROM file system support" 127source "fs/udf/Kconfig"
431 help
432 This is the standard file system used on CD-ROMs. It was previously
433 known as "High Sierra File System" and is called "hsfs" on other
434 Unix systems. The so-called Rock-Ridge extensions which allow for
435 long Unix filenames and symbolic links are also supported by this
436 driver. If you have a CD-ROM drive and want to do more with it than
437 just listen to audio CDs and watch its LEDs, say Y (and read
438 <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
439 available from <http://www.tldp.org/docs.html#howto>), thereby
440 enlarging your kernel by about 27 KB; otherwise say N.
441
442 To compile this file system support as a module, choose M here: the
443 module will be called isofs.
444
445config JOLIET
446 bool "Microsoft Joliet CDROM extensions"
447 depends on ISO9660_FS
448 select NLS
449 help
450 Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
451 which allows for long filenames in unicode format (unicode is the
452 new 16 bit character code, successor to ASCII, which encodes the
453 characters of almost all languages of the world; see
454 <http://www.unicode.org/> for more information). Say Y here if you
455 want to be able to read Joliet CD-ROMs under Linux.
456
457config ZISOFS
458 bool "Transparent decompression extension"
459 depends on ISO9660_FS
460 select ZLIB_INFLATE
461 help
462 This is a Linux-specific extension to RockRidge which lets you store
463 data in compressed form on a CD-ROM and have it transparently
464 decompressed when the CD-ROM is accessed. See
465 <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
466 necessary to create such a filesystem. Say Y here if you want to be
467 able to read such compressed CD-ROMs.
468
469config UDF_FS
470 tristate "UDF file system support"
471 select CRC_ITU_T
472 help
473 This is the new file system used on some CD-ROMs and DVDs. Say Y if
474 you intend to mount DVD discs or CDRW's written in packet mode, or
475 if written to by other UDF utilities, such as DirectCD.
476 Please read <file:Documentation/filesystems/udf.txt>.
477
478 To compile this file system support as a module, choose M here: the
479 module will be called udf.
480
481 If unsure, say N.
482
483config UDF_NLS
484 bool
485 default y
486 depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
487 128
488endmenu 129endmenu
489endif # BLOCK 130endif # BLOCK
@@ -491,182 +132,8 @@ endif # BLOCK
491if BLOCK 132if BLOCK
492menu "DOS/FAT/NT Filesystems" 133menu "DOS/FAT/NT Filesystems"
493 134
494config FAT_FS 135source "fs/fat/Kconfig"
495 tristate 136source "fs/ntfs/Kconfig"
496 select NLS
497 help
498 If you want to use one of the FAT-based file systems (the MS-DOS and
499 VFAT (Windows 95) file systems), then you must say Y or M here
500 to include FAT support. You will then be able to mount partitions or
501 diskettes with FAT-based file systems and transparently access the
502 files on them, i.e. MSDOS files will look and behave just like all
503 other Unix files.
504
505 This FAT support is not a file system in itself, it only provides
506 the foundation for the other file systems. You will have to say Y or
507 M to at least one of "MSDOS fs support" or "VFAT fs support" in
508 order to make use of it.
509
510 Another way to read and write MSDOS floppies and hard drive
511 partitions from within Linux (but not transparently) is with the
512 mtools ("man mtools") program suite. You don't need to say Y here in
513 order to do that.
514
515 If you need to move large files on floppies between a DOS and a
516 Linux box, say Y here, mount the floppy under Linux with an MSDOS
517 file system and use GNU tar's M option. GNU tar is a program
518 available for Unix and DOS ("man tar" or "info tar").
519
520 The FAT support will enlarge your kernel by about 37 KB. If unsure,
521 say Y.
522
523 To compile this as a module, choose M here: the module will be called
524 fat. Note that if you compile the FAT support as a module, you
525 cannot compile any of the FAT-based file systems into the kernel
526 -- they will have to be modules as well.
527
528config MSDOS_FS
529 tristate "MSDOS fs support"
530 select FAT_FS
531 help
532 This allows you to mount MSDOS partitions of your hard drive (unless
533 they are compressed; to access compressed MSDOS partitions under
534 Linux, you can either use the DOS emulator DOSEMU, described in the
535 DOSEMU-HOWTO, available from
536 <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
537 <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
538 intend to use dosemu with a non-compressed MSDOS partition, say Y
539 here) and MSDOS floppies. This means that file access becomes
540 transparent, i.e. the MSDOS files look and behave just like all
541 other Unix files.
542
543 If you have Windows 95 or Windows NT installed on your MSDOS
544 partitions, you should use the VFAT file system (say Y to "VFAT fs
545 support" below), or you will not be able to see the long filenames
546 generated by Windows 95 / Windows NT.
547
548 This option will enlarge your kernel by about 7 KB. If unsure,
549 answer Y. This will only work if you said Y to "DOS FAT fs support"
550 as well. To compile this as a module, choose M here: the module will
551 be called msdos.
552
553config VFAT_FS
554 tristate "VFAT (Windows-95) fs support"
555 select FAT_FS
556 help
557 This option provides support for normal Windows file systems with
558 long filenames. That includes non-compressed FAT-based file systems
559 used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
560 programs from the mtools package.
561
562 The VFAT support enlarges your kernel by about 10 KB and it only
563 works if you said Y to the "DOS FAT fs support" above. Please read
564 the file <file:Documentation/filesystems/vfat.txt> for details. If
565 unsure, say Y.
566
567 To compile this as a module, choose M here: the module will be called
568 vfat.
569
570config FAT_DEFAULT_CODEPAGE
571 int "Default codepage for FAT"
572 depends on MSDOS_FS || VFAT_FS
573 default 437
574 help
575 This option should be set to the codepage of your FAT filesystems.
576 It can be overridden with the "codepage" mount option.
577 See <file:Documentation/filesystems/vfat.txt> for more information.
578
579config FAT_DEFAULT_IOCHARSET
580 string "Default iocharset for FAT"
581 depends on VFAT_FS
582 default "iso8859-1"
583 help
584 Set this to the default input/output character set you'd
585 like FAT to use. It should probably match the character set
586 that most of your FAT filesystems use, and can be overridden
587 with the "iocharset" mount option for FAT filesystems.
588 Note that "utf8" is not recommended for FAT filesystems.
589 If unsure, you shouldn't set "utf8" here.
590 See <file:Documentation/filesystems/vfat.txt> for more information.
591
592config NTFS_FS
593 tristate "NTFS file system support"
594 select NLS
595 help
596 NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
597
598 Saying Y or M here enables read support. There is partial, but
599 safe, write support available. For write support you must also
600 say Y to "NTFS write support" below.
601
602 There are also a number of user-space tools available, called
603 ntfsprogs. These include ntfsundelete and ntfsresize, that work
604 without NTFS support enabled in the kernel.
605
606 This is a rewrite from scratch of Linux NTFS support and replaced
607 the old NTFS code starting with Linux 2.5.11. A backport to
608 the Linux 2.4 kernel series is separately available as a patch
609 from the project web site.
610
611 For more information see <file:Documentation/filesystems/ntfs.txt>
612 and <http://www.linux-ntfs.org/>.
613
614 To compile this file system support as a module, choose M here: the
615 module will be called ntfs.
616
617 If you are not using Windows NT, 2000, XP or 2003 in addition to
618 Linux on your computer it is safe to say N.
619
620config NTFS_DEBUG
621 bool "NTFS debugging support"
622 depends on NTFS_FS
623 help
624 If you are experiencing any problems with the NTFS file system, say
625 Y here. This will result in additional consistency checks to be
626 performed by the driver as well as additional debugging messages to
627 be written to the system log. Note that debugging messages are
628 disabled by default. To enable them, supply the option debug_msgs=1
629 at the kernel command line when booting the kernel or as an option
630 to insmod when loading the ntfs module. Once the driver is active,
631 you can enable debugging messages by doing (as root):
632 echo 1 > /proc/sys/fs/ntfs-debug
633 Replacing the "1" with "0" would disable debug messages.
634
635 If you leave debugging messages disabled, this results in little
636 overhead, but enabling debug messages results in very significant
637 slowdown of the system.
638
639 When reporting bugs, please try to have available a full dump of
640 debugging messages while the misbehaviour was occurring.
641
642config NTFS_RW
643 bool "NTFS write support"
644 depends on NTFS_FS
645 help
646 This enables the partial, but safe, write support in the NTFS driver.
647
648 The only supported operation is overwriting existing files, without
649 changing the file length. No file or directory creation, deletion or
650 renaming is possible. Note only non-resident files can be written to
651 so you may find that some very small files (<500 bytes or so) cannot
652 be written to.
653
654 While we cannot guarantee that it will not damage any data, we have
655 so far not received a single report where the driver would have
656 damaged someone's data so we assume it is perfectly safe to use.
657
658 Note: While write support is safe in this version (a rewrite from
659 scratch of the NTFS support), it should be noted that the old NTFS
660 write support, included in Linux 2.5.10 and before (since 1997),
661 is not safe.
662
663 This is currently useful with TopologiLinux. TopologiLinux is run
664 on top of any DOS/Microsoft Windows system without partitioning your
665 hard disk. Unlike other Linux distributions TopologiLinux does not
666 need its own partition. For more information see
667 <http://topologi-linux.sourceforge.net/>
668
669 It is perfectly safe to say N here.
670 137
671endmenu 138endmenu
672endif # BLOCK 139endif # BLOCK
@@ -674,30 +141,7 @@ endif # BLOCK
674menu "Pseudo filesystems" 141menu "Pseudo filesystems"
675 142
676source "fs/proc/Kconfig" 143source "fs/proc/Kconfig"
677 144source "fs/sysfs/Kconfig"
678config SYSFS
679 bool "sysfs file system support" if EMBEDDED
680 default y
681 help
682 The sysfs filesystem is a virtual filesystem that the kernel uses to
683 export internal kernel objects, their attributes, and their
684 relationships to one another.
685
686 Users can use sysfs to ascertain useful information about the running
687 kernel, such as the devices the kernel has discovered on each bus and
688 which driver each is bound to. sysfs can also be used to tune devices
689 and other kernel subsystems.
690
691 Some system agents rely on the information in sysfs to operate.
692 /sbin/hotplug uses device and object attributes in sysfs to assist in
693 delegating policy decisions, like persistently naming devices.
694
695 sysfs is currently used by the block subsystem to mount the root
696 partition. If sysfs is disabled you must specify the boot device on
697 the kernel boot command line via its major and minor numbers. For
698 example, "root=03:01" for /dev/hda1.
699
700 Designers of embedded systems may wish to say N here to conserve space.
701 145
702config TMPFS 146config TMPFS
703 bool "Virtual memory file system support (former shm fs)" 147 bool "Virtual memory file system support (former shm fs)"
@@ -738,391 +182,48 @@ config HUGETLBFS
738config HUGETLB_PAGE 182config HUGETLB_PAGE
739 def_bool HUGETLBFS 183 def_bool HUGETLBFS
740 184
741config CONFIGFS_FS 185source "fs/configfs/Kconfig"
742 tristate "Userspace-driven configuration filesystem"
743 depends on SYSFS
744 help
745 configfs is a ram-based filesystem that provides the converse
746 of sysfs's functionality. Where sysfs is a filesystem-based
747 view of kernel objects, configfs is a filesystem-based manager
748 of kernel objects, or config_items.
749
750 Both sysfs and configfs can and should exist together on the
751 same system. One is not a replacement for the other.
752 186
753endmenu 187endmenu
754 188
755menu "Miscellaneous filesystems" 189menuconfig MISC_FILESYSTEMS
756 190 bool "Miscellaneous filesystems"
757config ADFS_FS 191 default y
758 tristate "ADFS file system support (EXPERIMENTAL)" 192 ---help---
759 depends on BLOCK && EXPERIMENTAL 193 Say Y here to get to see options for various miscellaneous
760 help 194 filesystems, such as filesystems that came from other
761 The Acorn Disc Filing System is the standard file system of the 195 operating systems.
762 RiscOS operating system which runs on Acorn's ARM-based Risc PC
763 systems and the Acorn Archimedes range of machines. If you say Y
764 here, Linux will be able to read from ADFS partitions on hard drives
765 and from ADFS-formatted floppy discs. If you also want to be able to
766 write to those devices, say Y to "ADFS write support" below.
767
768 The ADFS partition should be the first partition (i.e.,
769 /dev/[hs]d?1) on each of your drives. Please read the file
770 <file:Documentation/filesystems/adfs.txt> for further details.
771
772 To compile this code as a module, choose M here: the module will be
773 called adfs.
774
775 If unsure, say N.
776
777config ADFS_FS_RW
778 bool "ADFS write support (DANGEROUS)"
779 depends on ADFS_FS
780 help
781 If you say Y here, you will be able to write to ADFS partitions on
782 hard drives and ADFS-formatted floppy disks. This is experimental
783 code, so if you're unsure, say N.
784
785config AFFS_FS
786 tristate "Amiga FFS file system support (EXPERIMENTAL)"
787 depends on BLOCK && EXPERIMENTAL
788 help
789 The Fast File System (FFS) is the common file system used on hard
790 disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y
791 if you want to be able to read and write files from and to an Amiga
792 FFS partition on your hard drive. Amiga floppies however cannot be
793 read with this driver due to an incompatibility of the floppy
794 controller used in an Amiga and the standard floppy controller in
795 PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
796 and <file:fs/affs/Changes>.
797
798 With this driver you can also mount disk files used by Bernd
799 Schmidt's Un*X Amiga Emulator
800 (<http://www.freiburg.linux.de/~uae/>).
801 If you want to do this, you will also need to say Y or M to "Loop
802 device support", above.
803
804 To compile this file system support as a module, choose M here: the
805 module will be called affs. If unsure, say N.
806
807config ECRYPT_FS
808 tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
809 depends on EXPERIMENTAL && KEYS && CRYPTO && NET
810 help
811 Encrypted filesystem that operates on the VFS layer. See
812 <file:Documentation/filesystems/ecryptfs.txt> to learn more about
813 eCryptfs. Userspace components are required and can be
814 obtained from <http://ecryptfs.sf.net>.
815
816 To compile this file system support as a module, choose M here: the
817 module will be called ecryptfs.
818
819config HFS_FS
820 tristate "Apple Macintosh file system support (EXPERIMENTAL)"
821 depends on BLOCK && EXPERIMENTAL
822 select NLS
823 help
824 If you say Y here, you will be able to mount Macintosh-formatted
825 floppy disks and hard drive partitions with full read-write access.
826 Please read <file:Documentation/filesystems/hfs.txt> to learn about
827 the available mount options.
828
829 To compile this file system support as a module, choose M here: the
830 module will be called hfs.
831
832config HFSPLUS_FS
833 tristate "Apple Extended HFS file system support"
834 depends on BLOCK
835 select NLS
836 select NLS_UTF8
837 help
838 If you say Y here, you will be able to mount extended format
839 Macintosh-formatted hard drive partitions with full read-write access.
840
841 This file system is often called HFS+ and was introduced with
842 MacOS 8. It includes all Mac specific filesystem data such as
843 data forks and creator codes, but it also has several UNIX
844 style features such as file ownership and permissions.
845
846config BEFS_FS
847 tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
848 depends on BLOCK && EXPERIMENTAL
849 select NLS
850 help
851 The BeOS File System (BeFS) is the native file system of Be, Inc's
852 BeOS. Notable features include support for arbitrary attributes
853 on files and directories, and database-like indices on selected
854 attributes. (Also note that this driver doesn't make those features
855 available at this time). It is a 64 bit filesystem, so it supports
856 extremely large volumes and files.
857
858 If you use this filesystem, you should also say Y to at least one
859 of the NLS (native language support) options below.
860
861 If you don't know what this is about, say N.
862
863 To compile this as a module, choose M here: the module will be
864 called befs.
865
866config BEFS_DEBUG
867 bool "Debug BeFS"
868 depends on BEFS_FS
869 help
870 If you say Y here, you can use the 'debug' mount option to enable
871 debugging output from the driver.
872
873config BFS_FS
874 tristate "BFS file system support (EXPERIMENTAL)"
875 depends on BLOCK && EXPERIMENTAL
876 help
877 Boot File System (BFS) is a file system used under SCO UnixWare to
878 allow the bootloader access to the kernel image and other important
879 files during the boot process. It is usually mounted under /stand
880 and corresponds to the slice marked as "STAND" in the UnixWare
881 partition. You should say Y if you want to read or write the files
882 on your /stand slice from within Linux. You then also need to say Y
883 to "UnixWare slices support", below. More information about the BFS
884 file system is contained in the file
885 <file:Documentation/filesystems/bfs.txt>.
886
887 If you don't know what this is about, say N.
888
889 To compile this as a module, choose M here: the module will be called
890 bfs. Note that the file system of your root partition (the one
891 containing the directory /) cannot be compiled as a module.
892
893
894 196
895config EFS_FS 197 This option alone does not add any kernel code.
896 tristate "EFS file system support (read only) (EXPERIMENTAL)"
897 depends on BLOCK && EXPERIMENTAL
898 help
899 EFS is an older file system used for non-ISO9660 CD-ROMs and hard
900 disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
901 uses the XFS file system for hard disk partitions however).
902 198
903 This implementation only offers read-only access. If you don't know 199 If you say N, all options in this submenu will be skipped and
904 what all this is about, it's safe to say N. For more information 200 disabled; if unsure, say Y here.
905 about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
906 201
907 To compile the EFS file system support as a module, choose M here: the 202if MISC_FILESYSTEMS
908 module will be called efs.
909 203
204source "fs/adfs/Kconfig"
205source "fs/affs/Kconfig"
206source "fs/ecryptfs/Kconfig"
207source "fs/hfs/Kconfig"
208source "fs/hfsplus/Kconfig"
209source "fs/befs/Kconfig"
210source "fs/bfs/Kconfig"
211source "fs/efs/Kconfig"
910source "fs/jffs2/Kconfig" 212source "fs/jffs2/Kconfig"
911# UBIFS File system configuration 213# UBIFS File system configuration
912source "fs/ubifs/Kconfig" 214source "fs/ubifs/Kconfig"
913 215source "fs/cramfs/Kconfig"
914config CRAMFS 216source "fs/squashfs/Kconfig"
915 tristate "Compressed ROM file system support (cramfs)" 217source "fs/freevxfs/Kconfig"
916 depends on BLOCK 218source "fs/minix/Kconfig"
917 select ZLIB_INFLATE 219source "fs/omfs/Kconfig"
918 help 220source "fs/hpfs/Kconfig"
919 Saying Y here includes support for CramFs (Compressed ROM File 221source "fs/qnx4/Kconfig"
920 System). CramFs is designed to be a simple, small, and compressed 222source "fs/romfs/Kconfig"
921 file system for ROM based embedded systems. CramFs is read-only, 223source "fs/sysv/Kconfig"
922 limited to 256MB file systems (with 16MB files), and doesn't support 224source "fs/ufs/Kconfig"
923 16/32 bits uid/gid, hard links and timestamps. 225
924 226endif # MISC_FILESYSTEMS
925 See <file:Documentation/filesystems/cramfs.txt> and
926 <file:fs/cramfs/README> for further information.
927
928 To compile this as a module, choose M here: the module will be called
929 cramfs. Note that the root file system (the one containing the
930 directory /) cannot be compiled as a module.
931
932 If unsure, say N.
933
934config VXFS_FS
935 tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
936 depends on BLOCK
937 help
938 FreeVxFS is a file system driver that supports the VERITAS VxFS(TM)
939 file system format. VERITAS VxFS(TM) is the standard file system
940 of SCO UnixWare (and possibly others) and optionally available
941 for Sunsoft Solaris, HP-UX and many other operating systems.
942 Currently only readonly access is supported.
943
944 NOTE: the file system type as used by mount(1), mount(2) and
945 fstab(5) is 'vxfs' as it describes the file system format, not
946 the actual driver.
947
948 To compile this as a module, choose M here: the module will be
949 called freevxfs. If unsure, say N.
950
951config MINIX_FS
952 tristate "Minix file system support"
953 depends on BLOCK
954 help
955 Minix is a simple operating system used in many classes about OS's.
956 The minix file system (method to organize files on a hard disk
957 partition or a floppy disk) was the original file system for Linux,
958 but has been superseded by the second extended file system ext2fs.
959 You don't want to use the minix file system on your hard disk
960 because of certain built-in restrictions, but it is sometimes found
961 on older Linux floppy disks. This option will enlarge your kernel
962 by about 28 KB. If unsure, say N.
963
964 To compile this file system support as a module, choose M here: the
965 module will be called minix. Note that the file system of your root
966 partition (the one containing the directory /) cannot be compiled as
967 a module.
968
969config OMFS_FS
970 tristate "SonicBlue Optimized MPEG File System support"
971 depends on BLOCK
972 select CRC_ITU_T
973 help
974 This is the proprietary file system used by the Rio Karma music
975 player and ReplayTV DVR. Despite the name, this filesystem is not
976 more efficient than a standard FS for MPEG files, in fact likely
977 the opposite is true. Say Y if you have either of these devices
978 and wish to mount its disk.
979
980 To compile this file system support as a module, choose M here: the
981 module will be called omfs. If unsure, say N.
982
983config HPFS_FS
984 tristate "OS/2 HPFS file system support"
985 depends on BLOCK
986 help
987 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
988 is the file system used for organizing files on OS/2 hard disk
989 partitions. Say Y if you want to be able to read files from and
990 write files to an OS/2 HPFS partition on your hard drive. OS/2
991 floppies however are in regular MSDOS format, so you don't need this
992 option in order to be able to read them. Read
993 <file:Documentation/filesystems/hpfs.txt>.
994
995 To compile this file system support as a module, choose M here: the
996 module will be called hpfs. If unsure, say N.
997
998
999config QNX4FS_FS
1000 tristate "QNX4 file system support (read only)"
1001 depends on BLOCK
1002 help
1003 This is the file system used by the real-time operating systems
1004 QNX 4 and QNX 6 (the latter is also called QNX RTP).
1005 Further information is available at <http://www.qnx.com/>.
1006 Say Y if you intend to mount QNX hard disks or floppies.
1007 Unless you say Y to "QNX4FS read-write support" below, you will
1008 only be able to read these file systems.
1009
1010 To compile this file system support as a module, choose M here: the
1011 module will be called qnx4.
1012
1013 If you don't know whether you need it, then you don't need it:
1014 answer N.
1015
1016config QNX4FS_RW
1017 bool "QNX4FS write support (DANGEROUS)"
1018 depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
1019 help
1020 Say Y if you want to test write support for QNX4 file systems.
1021
1022 It's currently broken, so for now:
1023 answer N.
1024
1025config ROMFS_FS
1026 tristate "ROM file system support"
1027 depends on BLOCK
1028 ---help---
1029 This is a very small read-only file system mainly intended for
1030 initial ram disks of installation disks, but it could be used for
1031 other read-only media as well. Read
1032 <file:Documentation/filesystems/romfs.txt> for details.
1033
1034 To compile this file system support as a module, choose M here: the
1035 module will be called romfs. Note that the file system of your
1036 root partition (the one containing the directory /) cannot be a
1037 module.
1038
1039 If you don't know whether you need it, then you don't need it:
1040 answer N.
1041
1042
1043config SYSV_FS
1044 tristate "System V/Xenix/V7/Coherent file system support"
1045 depends on BLOCK
1046 help
1047 SCO, Xenix and Coherent are commercial Unix systems for Intel
1048 machines, and Version 7 was used on the DEC PDP-11. Saying Y
1049 here would allow you to read from their floppies and hard disk
1050 partitions.
1051
1052 If you have floppies or hard disk partitions like that, it is likely
1053 that they contain binaries from those other Unix systems; in order
1054 to run these binaries, you will want to install linux-abi which is
1055 a set of kernel modules that lets you run SCO, Xenix, Wyse,
1056 UnixWare, Dell Unix and System V programs under Linux. It is
1057 available via FTP (user: ftp) from
1058 <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>.
1059 NOTE: that will work only for binaries from Intel-based systems;
1060 PDP ones will have to wait until somebody ports Linux to -11 ;-)
1061
1062 If you only intend to mount files from some other Unix over the
1063 network using NFS, you don't need the System V file system support
1064 (but you need NFS file system support obviously).
1065
1066 Note that this option is generally not needed for floppies, since a
1067 good portable way to transport files and directories between unixes
1068 (and even other operating systems) is given by the tar program ("man
1069 tar" or preferably "info tar"). Note also that this option has
1070 nothing whatsoever to do with the option "System V IPC". Read about
1071 the System V file system in
1072 <file:Documentation/filesystems/sysv-fs.txt>.
1073 Saying Y here will enlarge your kernel by about 27 KB.
1074
1075 To compile this as a module, choose M here: the module will be called
1076 sysv.
1077
1078 If you haven't heard about all of this before, it's safe to say N.
1079
1080
1081config UFS_FS
1082 tristate "UFS file system support (read only)"
1083 depends on BLOCK
1084 help
1085 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
1086 OpenBSD and NeXTstep) use a file system called UFS. Some System V
1087 Unixes can create and mount hard disk partitions and diskettes using
1088 this file system as well. Saying Y here will allow you to read from
1089 these partitions; if you also want to write to them, say Y to the
1090 experimental "UFS file system write support", below. Please read the
1091 file <file:Documentation/filesystems/ufs.txt> for more information.
1092
1093 The recently released UFS2 variant (used in FreeBSD 5.x) is
1094 READ-ONLY supported.
1095
1096 Note that this option is generally not needed for floppies, since a
1097 good portable way to transport files and directories between unixes
1098 (and even other operating systems) is given by the tar program ("man
1099 tar" or preferably "info tar").
1100
1101 When accessing NeXTstep files, you may need to convert them from the
1102 NeXT character set to the Latin1 character set; use the program
1103 recode ("info recode") for this purpose.
1104
1105 To compile the UFS file system support as a module, choose M here: the
1106 module will be called ufs.
1107
1108 If you haven't heard about all of this before, it's safe to say N.
1109
1110config UFS_FS_WRITE
1111 bool "UFS file system write support (DANGEROUS)"
1112 depends on UFS_FS && EXPERIMENTAL
1113 help
1114 Say Y here if you want to try writing to UFS partitions. This is
1115 experimental, so you should back up your UFS partitions beforehand.
1116
1117config UFS_DEBUG
1118 bool "UFS debugging"
1119 depends on UFS_FS
1120 help
1121 If you are experiencing any problems with the UFS filesystem, say
1122 Y here. This will result in _many_ additional debugging messages to be
1123 written to the system log.
1124
1125endmenu
1126 227
1127menuconfig NETWORK_FILESYSTEMS 228menuconfig NETWORK_FILESYSTEMS
1128 bool "Network File Systems" 229 bool "Network File Systems"
@@ -1140,173 +241,8 @@ menuconfig NETWORK_FILESYSTEMS
1140 241
1141if NETWORK_FILESYSTEMS 242if NETWORK_FILESYSTEMS
1142 243
1143config NFS_FS 244source "fs/nfs/Kconfig"
1144 tristate "NFS client support" 245source "fs/nfsd/Kconfig"
1145 depends on INET
1146 select LOCKD
1147 select SUNRPC
1148 select NFS_ACL_SUPPORT if NFS_V3_ACL
1149 help
1150 Choose Y here if you want to access files residing on other
1151 computers using Sun's Network File System protocol. To compile
1152 this file system support as a module, choose M here: the module
1153 will be called nfs.
1154
1155 To mount file systems exported by NFS servers, you also need to
1156 install the user space mount.nfs command which can be found in
1157 the Linux nfs-utils package, available from http://linux-nfs.org/.
1158 Information about using the mount command is available in the
1159 mount(8) man page. More detail about the Linux NFS client
1160 implementation is available via the nfs(5) man page.
1161
1162 Below you can choose which versions of the NFS protocol are
1163 available in the kernel to mount NFS servers. Support for NFS
1164 version 2 (RFC 1094) is always available when NFS_FS is selected.
1165
1166 To configure a system which mounts its root file system via NFS
1167 at boot time, say Y here, select "Kernel level IP
1168 autoconfiguration" in the NETWORK menu, and select "Root file
1169 system on NFS" below. You cannot compile this file system as a
1170 module in this case.
1171
1172 If unsure, say N.
1173
1174config NFS_V3
1175 bool "NFS client support for NFS version 3"
1176 depends on NFS_FS
1177 help
1178 This option enables support for version 3 of the NFS protocol
1179 (RFC 1813) in the kernel's NFS client.
1180
1181 If unsure, say Y.
1182
1183config NFS_V3_ACL
1184 bool "NFS client support for the NFSv3 ACL protocol extension"
1185 depends on NFS_V3
1186 help
1187 Some NFS servers support an auxiliary NFSv3 ACL protocol that
1188 Sun added to Solaris but never became an official part of the
1189 NFS version 3 protocol. This protocol extension allows
1190 applications on NFS clients to manipulate POSIX Access Control
1191 Lists on files residing on NFS servers. NFS servers enforce
1192 ACLs on local files whether this protocol is available or not.
1193
1194 Choose Y here if your NFS server supports the Solaris NFSv3 ACL
1195 protocol extension and you want your NFS client to allow
1196 applications to access and modify ACLs on files on the server.
1197
1198 Most NFS servers don't support the Solaris NFSv3 ACL protocol
1199 extension. You can choose N here or specify the "noacl" mount
1200 option to prevent your NFS client from trying to use the NFSv3
1201 ACL protocol.
1202
1203 If unsure, say N.
1204
1205config NFS_V4
1206 bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
1207 depends on NFS_FS && EXPERIMENTAL
1208 select RPCSEC_GSS_KRB5
1209 help
1210 This option enables support for version 4 of the NFS protocol
1211 (RFC 3530) in the kernel's NFS client.
1212
1213 To mount NFS servers using NFSv4, you also need to install user
1214 space programs which can be found in the Linux nfs-utils package,
1215 available from http://linux-nfs.org/.
1216
1217 If unsure, say N.
1218
1219config ROOT_NFS
1220 bool "Root file system on NFS"
1221 depends on NFS_FS=y && IP_PNP
1222 help
1223 If you want your system to mount its root file system via NFS,
1224 choose Y here. This is common practice for managing systems
1225 without local permanent storage. For details, read
1226 <file:Documentation/filesystems/nfsroot.txt>.
1227
1228 Most people say N here.
1229
1230config NFSD
1231 tristate "NFS server support"
1232 depends on INET
1233 select LOCKD
1234 select SUNRPC
1235 select EXPORTFS
1236 select NFS_ACL_SUPPORT if NFSD_V2_ACL
1237 help
1238 Choose Y here if you want to allow other computers to access
1239 files residing on this system using Sun's Network File System
1240 protocol. To compile the NFS server support as a module,
1241 choose M here: the module will be called nfsd.
1242
1243 You may choose to use a user-space NFS server instead, in which
1244 case you can choose N here.
1245
1246 To export local file systems using NFS, you also need to install
1247 user space programs which can be found in the Linux nfs-utils
1248 package, available from http://linux-nfs.org/. More detail about
1249 the Linux NFS server implementation is available via the
1250 exports(5) man page.
1251
1252 Below you can choose which versions of the NFS protocol are
1253 available to clients mounting the NFS server on this system.
1254 Support for NFS version 2 (RFC 1094) is always available when
1255 CONFIG_NFSD is selected.
1256
1257 If unsure, say N.
1258
1259config NFSD_V2_ACL
1260 bool
1261 depends on NFSD
1262
1263config NFSD_V3
1264 bool "NFS server support for NFS version 3"
1265 depends on NFSD
1266 help
1267 This option enables support in your system's NFS server for
1268 version 3 of the NFS protocol (RFC 1813).
1269
1270 If unsure, say Y.
1271
1272config NFSD_V3_ACL
1273 bool "NFS server support for the NFSv3 ACL protocol extension"
1274 depends on NFSD_V3
1275 select NFSD_V2_ACL
1276 help
1277 Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
1278 never became an official part of the NFS version 3 protocol.
1279 This protocol extension allows applications on NFS clients to
1280 manipulate POSIX Access Control Lists on files residing on NFS
1281 servers. NFS servers enforce POSIX ACLs on local files whether
1282 this protocol is available or not.
1283
1284 This option enables support in your system's NFS server for the
1285 NFSv3 ACL protocol extension allowing NFS clients to manipulate
1286 POSIX ACLs on files exported by your system's NFS server. NFS
1287 clients which support the Solaris NFSv3 ACL protocol can then
1288 access and modify ACLs on your NFS server.
1289
1290 To store ACLs on your NFS server, you also need to enable ACL-
1291 related CONFIG options for your local file systems of choice.
1292
1293 If unsure, say N.
1294
1295config NFSD_V4
1296 bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
1297 depends on NFSD && PROC_FS && EXPERIMENTAL
1298 select NFSD_V3
1299 select FS_POSIX_ACL
1300 select RPCSEC_GSS_KRB5
1301 help
1302 This option enables support in your system's NFS server for
1303 version 4 of the NFS protocol (RFC 3530).
1304
1305 To export files using NFSv4, you need to install additional user
1306 space programs which can be found in the Linux nfs-utils package,
1307 available from http://linux-nfs.org/.
1308
1309 If unsure, say N.
1310 246
1311config LOCKD 247config LOCKD
1312 tristate 248 tristate
@@ -1328,221 +264,13 @@ config NFS_COMMON
1328 depends on NFSD || NFS_FS 264 depends on NFSD || NFS_FS
1329 default y 265 default y
1330 266
1331config SUNRPC 267source "net/sunrpc/Kconfig"
1332 tristate 268source "fs/smbfs/Kconfig"
1333
1334config SUNRPC_GSS
1335 tristate
1336
1337config SUNRPC_XPRT_RDMA
1338 tristate
1339 depends on SUNRPC && INFINIBAND && EXPERIMENTAL
1340 default SUNRPC && INFINIBAND
1341 help
1342 This option enables an RPC client transport capability that
1343 allows the NFS client to mount servers via an RDMA-enabled
1344 transport.
1345
1346 To compile RPC client RDMA transport support as a module,
1347 choose M here: the module will be called xprtrdma.
1348
1349 If unsure, say N.
1350
1351config SUNRPC_REGISTER_V4
1352 bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
1353 depends on SUNRPC && EXPERIMENTAL
1354 default n
1355 help
1356 Sun added support for registering RPC services at an IPv6
1357 address by creating two new versions of the rpcbind protocol
1358 (RFC 1833).
1359
1360 This option enables support in the kernel RPC server for
1361 registering kernel RPC services via version 4 of the rpcbind
1362 protocol. If you enable this option, you must run a portmapper
1363 daemon that supports rpcbind protocol version 4.
1364
1365 Serving NFS over IPv6 from knfsd (the kernel's NFS server)
1366 requires that you enable this option and use a portmapper that
1367 supports rpcbind version 4.
1368
1369 If unsure, say N to get traditional behavior (register kernel
1370 RPC services using only rpcbind version 2). Distributions
1371 using the legacy Linux portmapper daemon must say N here.
1372
1373config RPCSEC_GSS_KRB5
1374 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
1375 depends on SUNRPC && EXPERIMENTAL
1376 select SUNRPC_GSS
1377 select CRYPTO
1378 select CRYPTO_MD5
1379 select CRYPTO_DES
1380 select CRYPTO_CBC
1381 help
1382 Choose Y here to enable Secure RPC using the Kerberos version 5
1383 GSS-API mechanism (RFC 1964).
1384
1385 Secure RPC calls with Kerberos require an auxiliary user-space
1386 daemon which may be found in the Linux nfs-utils package
1387 available from http://linux-nfs.org/. In addition, user-space
1388 Kerberos support should be installed.
1389
1390 If unsure, say N.
1391
1392config RPCSEC_GSS_SPKM3
1393 tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
1394 depends on SUNRPC && EXPERIMENTAL
1395 select SUNRPC_GSS
1396 select CRYPTO
1397 select CRYPTO_MD5
1398 select CRYPTO_DES
1399 select CRYPTO_CAST5
1400 select CRYPTO_CBC
1401 help
1402 Choose Y here to enable Secure RPC using the SPKM3 public key
1403 GSS-API mechanism (RFC 2025).
1404
1405 Secure RPC calls with SPKM3 require an auxiliary userspace
1406 daemon which may be found in the Linux nfs-utils package
1407 available from http://linux-nfs.org/.
1408
1409 If unsure, say N.
1410
1411config SMB_FS
1412 tristate "SMB file system support (OBSOLETE, please use CIFS)"
1413 depends on INET
1414 select NLS
1415 help
1416 SMB (Server Message Block) is the protocol Windows for Workgroups
1417 (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
1418 files and printers over local networks. Saying Y here allows you to
1419 mount their file systems (often called "shares" in this context) and
1420 access them just like any other Unix directory. Currently, this
1421 works only if the Windows machines use TCP/IP as the underlying
1422 transport protocol, and not NetBEUI. For details, read
1423 <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
1424 available from <http://www.tldp.org/docs.html#howto>.
1425
1426 Note: if you just want your box to act as an SMB *server* and make
1427 files and printing services available to Windows clients (which need
1428 to have a TCP/IP stack), you don't need to say Y here; you can use
1429 the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
1430 for that.
1431
1432 General information about how to connect Linux, Windows machines and
1433 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
1434
1435 To compile the SMB support as a module, choose M here:
1436 the module will be called smbfs. Most people say N, however.
1437
1438config SMB_NLS_DEFAULT
1439 bool "Use a default NLS"
1440 depends on SMB_FS
1441 help
1442 Enabling this will make smbfs use nls translations by default. You
1443 need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
1444 settings and you need to give the default nls for the SMB server as
1445 CONFIG_SMB_NLS_REMOTE.
1446
1447 The nls settings can be changed at mount time, if your smbmount
1448 supports that, using the codepage and iocharset parameters.
1449
1450 smbmount from samba 2.2.0 or later supports this.
1451
1452config SMB_NLS_REMOTE
1453 string "Default Remote NLS Option"
1454 depends on SMB_NLS_DEFAULT
1455 default "cp437"
1456 help
1457 This setting allows you to specify a default value for which
1458 codepage the server uses. If this field is left blank no
1459 translations will be done by default. The local codepage/charset
1460 defaults to CONFIG_NLS_DEFAULT.
1461
1462 The nls settings can be changed at mount time, if your smbmount
1463 supports that, using the codepage and iocharset parameters.
1464
1465 smbmount from samba 2.2.0 or later supports this.
1466
1467source "fs/cifs/Kconfig" 269source "fs/cifs/Kconfig"
1468
1469config NCP_FS
1470 tristate "NCP file system support (to mount NetWare volumes)"
1471 depends on IPX!=n || INET
1472 help
1473 NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
1474 used by Novell NetWare clients to talk to file servers. It is to
1475 IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you
1476 to mount NetWare file server volumes and to access them just like
1477 any other Unix directory. For details, please read the file
1478 <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
1479 the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
1480
1481 You do not have to say Y here if you want your Linux box to act as a
1482 file *server* for Novell NetWare clients.
1483
1484 General information about how to connect Linux, Windows machines and
1485 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
1486
1487 To compile this as a module, choose M here: the module will be called
1488 ncpfs. Say N unless you are connected to a Novell network.
1489
1490source "fs/ncpfs/Kconfig" 270source "fs/ncpfs/Kconfig"
1491 271source "fs/coda/Kconfig"
1492config CODA_FS 272source "fs/afs/Kconfig"
1493 tristate "Coda file system support (advanced network fs)" 273source "fs/9p/Kconfig"
1494 depends on INET
1495 help
1496 Coda is an advanced network file system, similar to NFS in that it
1497 enables you to mount file systems of a remote server and access them
1498 with regular Unix commands as if they were sitting on your hard
1499 disk. Coda has several advantages over NFS: support for
1500 disconnected operation (e.g. for laptops), read/write server
1501 replication, security model for authentication and encryption,
1502 persistent client caches and write back caching.
1503
1504 If you say Y here, your Linux box will be able to act as a Coda
1505 *client*. You will need user level code as well, both for the
1506 client and server. Servers are currently user level, i.e. they need
1507 no kernel support. Please read
1508 <file:Documentation/filesystems/coda.txt> and check out the Coda
1509 home page <http://www.coda.cs.cmu.edu/>.
1510
1511 To compile the coda client support as a module, choose M here: the
1512 module will be called coda.
1513
1514config AFS_FS
1515 tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
1516 depends on INET && EXPERIMENTAL
1517 select AF_RXRPC
1518 help
1519 If you say Y here, you will get an experimental Andrew File System
1520 driver. It currently only supports unsecured read-only AFS access.
1521
1522 See <file:Documentation/filesystems/afs.txt> for more information.
1523
1524 If unsure, say N.
1525
1526config AFS_DEBUG
1527 bool "AFS dynamic debugging"
1528 depends on AFS_FS
1529 help
1530 Say Y here to make runtime controllable debugging messages appear.
1531
1532 See <file:Documentation/filesystems/afs.txt> for more information.
1533
1534 If unsure, say N.
1535
1536config 9P_FS
1537 tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
1538 depends on INET && NET_9P && EXPERIMENTAL
1539 help
1540 If you say Y here, you will get experimental support for
1541 Plan 9 resource sharing via the 9P2000 protocol.
1542
1543 See <http://v9fs.sf.net> for more information.
1544
1545 If unsure, say N.
1546 274
1547endif # NETWORK_FILESYSTEMS 275endif # NETWORK_FILESYSTEMS
1548 276
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index ce9fb3fbfae4..bb4cc5b8abc8 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -43,7 +43,7 @@ config BINFMT_ELF_FDPIC
43config CORE_DUMP_DEFAULT_ELF_HEADERS 43config CORE_DUMP_DEFAULT_ELF_HEADERS
44 bool "Write ELF core dumps with partial segments" 44 bool "Write ELF core dumps with partial segments"
45 default n 45 default n
46 depends on BINFMT_ELF 46 depends on BINFMT_ELF && ELF_CORE
47 help 47 help
48 ELF core dump files describe each memory mapping of the crashed 48 ELF core dump files describe each memory mapping of the crashed
49 process, and can contain or omit the memory contents of each one. 49 process, and can contain or omit the memory contents of each one.
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe6f0c4..38bc735c67ad 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -20,8 +20,7 @@ obj-y += no-block.o
20endif 20endif
21 21
22obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o 22obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
23obj-$(CONFIG_INOTIFY) += inotify.o 23obj-y += notify/
24obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
25obj-$(CONFIG_EPOLL) += eventpoll.o 24obj-$(CONFIG_EPOLL) += eventpoll.o
26obj-$(CONFIG_ANON_INODES) += anon_inodes.o 25obj-$(CONFIG_ANON_INODES) += anon_inodes.o
27obj-$(CONFIG_SIGNALFD) += signalfd.o 26obj-$(CONFIG_SIGNALFD) += signalfd.o
@@ -55,10 +54,9 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
55obj-$(CONFIG_QUOTA) += dquot.o 54obj-$(CONFIG_QUOTA) += dquot.o
56obj-$(CONFIG_QFMT_V1) += quota_v1.o 55obj-$(CONFIG_QFMT_V1) += quota_v1.o
57obj-$(CONFIG_QFMT_V2) += quota_v2.o 56obj-$(CONFIG_QFMT_V2) += quota_v2.o
57obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
58obj-$(CONFIG_QUOTACTL) += quota.o 58obj-$(CONFIG_QUOTACTL) += quota.o
59 59
60obj-$(CONFIG_DNOTIFY) += dnotify.o
61
62obj-$(CONFIG_PROC_FS) += proc/ 60obj-$(CONFIG_PROC_FS) += proc/
63obj-y += partitions/ 61obj-y += partitions/
64obj-$(CONFIG_SYSFS) += sysfs/ 62obj-$(CONFIG_SYSFS) += sysfs/
@@ -76,6 +74,7 @@ obj-$(CONFIG_JBD) += jbd/
76obj-$(CONFIG_JBD2) += jbd2/ 74obj-$(CONFIG_JBD2) += jbd2/
77obj-$(CONFIG_EXT2_FS) += ext2/ 75obj-$(CONFIG_EXT2_FS) += ext2/
78obj-$(CONFIG_CRAMFS) += cramfs/ 76obj-$(CONFIG_CRAMFS) += cramfs/
77obj-$(CONFIG_SQUASHFS) += squashfs/
79obj-y += ramfs/ 78obj-y += ramfs/
80obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ 79obj-$(CONFIG_HUGETLBFS) += hugetlbfs/
81obj-$(CONFIG_CODA_FS) += coda/ 80obj-$(CONFIG_CODA_FS) += coda/
@@ -121,4 +120,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/
121obj-$(CONFIG_HPPFS) += hppfs/ 120obj-$(CONFIG_HPPFS) += hppfs/
122obj-$(CONFIG_DEBUG_FS) += debugfs/ 121obj-$(CONFIG_DEBUG_FS) += debugfs/
123obj-$(CONFIG_OCFS2_FS) += ocfs2/ 122obj-$(CONFIG_OCFS2_FS) += ocfs2/
123obj-$(CONFIG_BTRFS_FS) += btrfs/
124obj-$(CONFIG_GFS2_FS) += gfs2/ 124obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
new file mode 100644
index 000000000000..e55182a74605
--- /dev/null
+++ b/fs/adfs/Kconfig
@@ -0,0 +1,27 @@
1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 help
5 The Acorn Disc Filing System is the standard file system of the
6 RiscOS operating system which runs on Acorn's ARM-based Risc PC
7 systems and the Acorn Archimedes range of machines. If you say Y
8 here, Linux will be able to read from ADFS partitions on hard drives
9 and from ADFS-formatted floppy discs. If you also want to be able to
10 write to those devices, say Y to "ADFS write support" below.
11
12 The ADFS partition should be the first partition (i.e.,
13 /dev/[hs]d?1) on each of your drives. Please read the file
14 <file:Documentation/filesystems/adfs.txt> for further details.
15
16 To compile this code as a module, choose M here: the module will be
17 called adfs.
18
19 If unsure, say N.
20
21config ADFS_FS_RW
22 bool "ADFS write support (DANGEROUS)"
23 depends on ADFS_FS
24 help
25 If you say Y here, you will be able to write to ADFS partitions on
26 hard drives and ADFS-formatted floppy disks. This is experimental
27 code, so if you're unsure, say N.
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
new file mode 100644
index 000000000000..cfad9afb4762
--- /dev/null
+++ b/fs/affs/Kconfig
@@ -0,0 +1,21 @@
1config AFFS_FS
2 tristate "Amiga FFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 help
5 The Fast File System (FFS) is the common file system used on hard
6 disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y
7 if you want to be able to read and write files from and to an Amiga
8 FFS partition on your hard drive. Amiga floppies however cannot be
9 read with this driver due to an incompatibility of the floppy
10 controller used in an Amiga and the standard floppy controller in
11 PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
12 and <file:fs/affs/Changes>.
13
14 With this driver you can also mount disk files used by Bernd
15 Schmidt's Un*X Amiga Emulator
16 (<http://www.freiburg.linux.de/~uae/>).
17 If you want to do this, you will also need to say Y or M to "Loop
18 device support", above.
19
20 To compile this file system support as a module, choose M here: the
21 module will be called affs. If unsure, say N.
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1377b1240b6e..9246cb4aa018 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
628 } 628 }
629 629
630 index = pos >> PAGE_CACHE_SHIFT; 630 index = pos >> PAGE_CACHE_SHIFT;
631 page = __grab_cache_page(mapping, index); 631 page = grab_cache_page_write_begin(mapping, index, flags);
632 if (!page) 632 if (!page)
633 return -ENOMEM; 633 return -ENOMEM;
634 *pagep = page; 634 *pagep = page;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac16..3c4ec7d864c4 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
119 goto bad_inode; 119 goto bad_inode;
120#else 120#else
121 inode->i_mode |= S_IFDIR; 121 inode->i_mode |= S_IFDIR;
122 inode->i_op = NULL; 122 /* ... and leave ->i_op and ->i_fop pointing to empty */
123 inode->i_fop = NULL;
124 break; 123 break;
125#endif 124#endif
126 case ST_LINKFILE: 125 case ST_LINKFILE:
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
new file mode 100644
index 000000000000..e7b522fe15e1
--- /dev/null
+++ b/fs/afs/Kconfig
@@ -0,0 +1,21 @@
1config AFS_FS
2 tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select AF_RXRPC
5 help
6 If you say Y here, you will get an experimental Andrew File System
7 driver. It currently only supports unsecured read-only AFS access.
8
9 See <file:Documentation/filesystems/afs.txt> for more information.
10
11 If unsure, say N.
12
13config AFS_DEBUG
14 bool "AFS dynamic debugging"
15 depends on AFS_FS
16 help
17 Say Y here to make runtime controllable debugging messages appear.
18
19 See <file:Documentation/filesystems/afs.txt> for more information.
20
21 If unsure, say N.
diff --git a/fs/afs/write.c b/fs/afs/write.c
index d6b85dab35fc..3fb36d433621 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
144 candidate->state = AFS_WBACK_PENDING; 144 candidate->state = AFS_WBACK_PENDING;
145 init_waitqueue_head(&candidate->waitq); 145 init_waitqueue_head(&candidate->waitq);
146 146
147 page = __grab_cache_page(mapping, index); 147 page = grab_cache_page_write_begin(mapping, index, flags);
148 if (!page) { 148 if (!page) {
149 kfree(candidate); 149 kfree(candidate);
150 return -ENOMEM; 150 return -ENOMEM;
diff --git a/fs/aio.c b/fs/aio.c
index d6f89d3c15e8..8fa77e233944 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1270,7 +1270,7 @@ static void io_destroy(struct kioctx *ioctx)
1270 * pointer is passed for ctxp. Will fail with -ENOSYS if not 1270 * pointer is passed for ctxp. Will fail with -ENOSYS if not
1271 * implemented. 1271 * implemented.
1272 */ 1272 */
1273asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp) 1273SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
1274{ 1274{
1275 struct kioctx *ioctx = NULL; 1275 struct kioctx *ioctx = NULL;
1276 unsigned long ctx; 1276 unsigned long ctx;
@@ -1308,7 +1308,7 @@ out:
1308 * implemented. May fail with -EFAULT if the context pointed to 1308 * implemented. May fail with -EFAULT if the context pointed to
1309 * is invalid. 1309 * is invalid.
1310 */ 1310 */
1311asmlinkage long sys_io_destroy(aio_context_t ctx) 1311SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1312{ 1312{
1313 struct kioctx *ioctx = lookup_ioctx(ctx); 1313 struct kioctx *ioctx = lookup_ioctx(ctx);
1314 if (likely(NULL != ioctx)) { 1314 if (likely(NULL != ioctx)) {
@@ -1662,8 +1662,8 @@ out_put_req:
1662 * are available to queue any iocbs. Will return 0 if nr is 0. Will 1662 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1663 * fail with -ENOSYS if not implemented. 1663 * fail with -ENOSYS if not implemented.
1664 */ 1664 */
1665asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr, 1665SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1666 struct iocb __user * __user *iocbpp) 1666 struct iocb __user * __user *, iocbpp)
1667{ 1667{
1668 struct kioctx *ctx; 1668 struct kioctx *ctx;
1669 long ret = 0; 1669 long ret = 0;
@@ -1737,8 +1737,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
1737 * invalid. May fail with -EAGAIN if the iocb specified was not 1737 * invalid. May fail with -EAGAIN if the iocb specified was not
1738 * cancelled. Will fail with -ENOSYS if not implemented. 1738 * cancelled. Will fail with -ENOSYS if not implemented.
1739 */ 1739 */
1740asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb, 1740SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
1741 struct io_event __user *result) 1741 struct io_event __user *, result)
1742{ 1742{
1743 int (*cancel)(struct kiocb *iocb, struct io_event *res); 1743 int (*cancel)(struct kiocb *iocb, struct io_event *res);
1744 struct kioctx *ctx; 1744 struct kioctx *ctx;
@@ -1799,11 +1799,11 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
1799 * will be updated if not NULL and the operation blocks. Will fail 1799 * will be updated if not NULL and the operation blocks. Will fail
1800 * with -ENOSYS if not implemented. 1800 * with -ENOSYS if not implemented.
1801 */ 1801 */
1802asmlinkage long sys_io_getevents(aio_context_t ctx_id, 1802SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1803 long min_nr, 1803 long, min_nr,
1804 long nr, 1804 long, nr,
1805 struct io_event __user *events, 1805 struct io_event __user *, events,
1806 struct timespec __user *timeout) 1806 struct timespec __user *, timeout)
1807{ 1807{
1808 struct kioctx *ioctx = lookup_ioctx(ctx_id); 1808 struct kioctx *ioctx = lookup_ioctx(ctx_id);
1809 long ret = -EINVAL; 1809 long ret = -EINVAL;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c16d9be1b017..3bbdb9d02376 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
79 if (IS_ERR(anon_inode_inode)) 79 if (IS_ERR(anon_inode_inode))
80 return -ENODEV; 80 return -ENODEV;
81 81
82 if (fops->owner && !try_module_get(fops->owner))
83 return -ENOENT;
84
82 error = get_unused_fd_flags(flags); 85 error = get_unused_fd_flags(flags);
83 if (error < 0) 86 if (error < 0)
84 return error; 87 goto err_module;
85 fd = error; 88 fd = error;
86 89
87 /* 90 /*
@@ -128,6 +131,8 @@ err_dput:
128 dput(dentry); 131 dput(dentry);
129err_put_unused_fd: 132err_put_unused_fd:
130 put_unused_fd(fd); 133 put_unused_fd(fd);
134err_module:
135 module_put(fops->owner);
131 return error; 136 return error;
132} 137}
133EXPORT_SYMBOL_GPL(anon_inode_getfd); 138EXPORT_SYMBOL_GPL(anon_inode_getfd);
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
new file mode 100644
index 000000000000..5f3bea90911e
--- /dev/null
+++ b/fs/autofs/Kconfig
@@ -0,0 +1,21 @@
1config AUTOFS_FS
2 tristate "Kernel automounter support"
3 help
4 The automounter is a tool to automatically mount remote file systems
5 on demand. This implementation is partially kernel-based to reduce
6 overhead in the already-mounted case; this is unlike the BSD
7 automounter (amd), which is a pure user space daemon.
8
9 To use the automounter you need the user-space tools from the autofs
10 package; you can find the location in <file:Documentation/Changes>.
11 You also want to answer Y to "NFS file system support", below.
12
13 If you want to use the newer version of the automounter with more
14 features, say N here and say Y to "Kernel automounter v4 support",
15 below.
16
17 To compile this support as a module, choose M here: the module will be
18 called autofs.
19
20 If you are not a part of a fairly large, distributed network, you
21 probably do not need an automounter, and can say N here.
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c60..e1734f2d6e26 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
251 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 251 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
252 inode->i_nlink = 2; 252 inode->i_nlink = 2;
253 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 253 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
254 inode->i_blocks = 0;
255 254
256 if (ino == AUTOFS_ROOT_INO) { 255 if (ino == AUTOFS_ROOT_INO) {
257 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 256 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
258 inode->i_op = &autofs_root_inode_operations; 257 inode->i_op = &autofs_root_inode_operations;
259 inode->i_fop = &autofs_root_operations; 258 inode->i_fop = &autofs_root_operations;
260 inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
261 goto done; 259 goto done;
262 } 260 }
263 261
diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig
new file mode 100644
index 000000000000..1204d6384d39
--- /dev/null
+++ b/fs/autofs4/Kconfig
@@ -0,0 +1,20 @@
1config AUTOFS4_FS
2 tristate "Kernel automounter version 4 support (also supports v3)"
3 help
4 The automounter is a tool to automatically mount remote file systems
5 on demand. This implementation is partially kernel-based to reduce
6 overhead in the already-mounted case; this is unlike the BSD
7 automounter (amd), which is a pure user space daemon.
8
9 To use the automounter you need the user-space tools from
10 <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
11 want to answer Y to "NFS file system support", below.
12
13 To compile this support as a module, choose M here: the module will be
14 called autofs4. You will need to add "alias autofs autofs4" to your
15 modules configuration file.
16
17 If you are not a part of a fairly large, distributed network or
18 don't have a laptop which needs to dynamically reconfigure to the
19 local network, you probably do not need an automounter, and can say
20 N here.
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e54..a76803108d06 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
25#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) 25#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION)
26#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) 26#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11)
27 27
28#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
29
30#include <linux/kernel.h> 28#include <linux/kernel.h>
31#include <linux/slab.h> 29#include <linux/slab.h>
32#include <linux/time.h> 30#include <linux/time.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8df..025e105bffea 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
124 124
125/* 125/*
126 * Check sanity of parameter control fields and if a path is present 126 * Check sanity of parameter control fields and if a path is present
127 * check that it has a "/" and is terminated. 127 * check that it is terminated and contains at least one "/".
128 */ 128 */
129static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) 129static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
130{ 130{
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
138 } 138 }
139 139
140 if (param->size > sizeof(*param)) { 140 if (param->size > sizeof(*param)) {
141 err = check_name(param->path); 141 err = invalid_str(param->path,
142 (void *) ((size_t) param + param->size));
142 if (err) { 143 if (err) {
143 AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", 144 AUTOFS_WARN(
144 cmd); 145 "path string terminator missing for cmd(0x%08x)",
146 cmd);
145 goto out; 147 goto out;
146 } 148 }
147 149
148 err = invalid_str(param->path, 150 err = check_name(param->path);
149 (void *) ((size_t) param + param->size));
150 if (err) { 151 if (err) {
151 AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", 152 AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
152 cmd); 153 cmd);
@@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
180 struct autofs_sb_info *sbi, 181 struct autofs_sb_info *sbi,
181 struct autofs_dev_ioctl *param) 182 struct autofs_dev_ioctl *param)
182{ 183{
183 param->arg1 = sbi->version; 184 param->protover.version = sbi->version;
184 return 0; 185 return 0;
185} 186}
186 187
@@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
189 struct autofs_sb_info *sbi, 190 struct autofs_sb_info *sbi,
190 struct autofs_dev_ioctl *param) 191 struct autofs_dev_ioctl *param)
191{ 192{
192 param->arg1 = sbi->sub_version; 193 param->protosubver.sub_version = sbi->sub_version;
193 return 0; 194 return 0;
194} 195}
195 196
@@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
335 int err, fd; 336 int err, fd;
336 337
337 /* param->path has already been checked */ 338 /* param->path has already been checked */
338 if (!param->arg1) 339 if (!param->openmount.devid)
339 return -EINVAL; 340 return -EINVAL;
340 341
341 param->ioctlfd = -1; 342 param->ioctlfd = -1;
342 343
343 path = param->path; 344 path = param->path;
344 devid = param->arg1; 345 devid = param->openmount.devid;
345 346
346 err = 0; 347 err = 0;
347 fd = autofs_dev_ioctl_open_mountpoint(path, devid); 348 fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
373{ 374{
374 autofs_wqt_t token; 375 autofs_wqt_t token;
375 376
376 token = (autofs_wqt_t) param->arg1; 377 token = (autofs_wqt_t) param->ready.token;
377 return autofs4_wait_release(sbi, token, 0); 378 return autofs4_wait_release(sbi, token, 0);
378} 379}
379 380
@@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
388 autofs_wqt_t token; 389 autofs_wqt_t token;
389 int status; 390 int status;
390 391
391 token = (autofs_wqt_t) param->arg1; 392 token = (autofs_wqt_t) param->fail.token;
392 status = param->arg2 ? param->arg2 : -ENOENT; 393 status = param->fail.status ? param->fail.status : -ENOENT;
393 return autofs4_wait_release(sbi, token, status); 394 return autofs4_wait_release(sbi, token, status);
394} 395}
395 396
@@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
412 int pipefd; 413 int pipefd;
413 int err = 0; 414 int err = 0;
414 415
415 if (param->arg1 == -1) 416 if (param->setpipefd.pipefd == -1)
416 return -EINVAL; 417 return -EINVAL;
417 418
418 pipefd = param->arg1; 419 pipefd = param->setpipefd.pipefd;
419 420
420 mutex_lock(&sbi->wq_mutex); 421 mutex_lock(&sbi->wq_mutex);
421 if (!sbi->catatonic) { 422 if (!sbi->catatonic) {
@@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
457{ 458{
458 unsigned long timeout; 459 unsigned long timeout;
459 460
460 timeout = param->arg1; 461 timeout = param->timeout.timeout;
461 param->arg1 = sbi->exp_timeout / HZ; 462 param->timeout.timeout = sbi->exp_timeout / HZ;
462 sbi->exp_timeout = timeout * HZ; 463 sbi->exp_timeout = timeout * HZ;
463 return 0; 464 return 0;
464} 465}
@@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
489 path = param->path; 490 path = param->path;
490 devid = sbi->sb->s_dev; 491 devid = sbi->sb->s_dev;
491 492
492 param->arg1 = param->arg2 = -1; 493 param->requester.uid = param->requester.gid = -1;
493 494
494 /* Get nameidata of the parent directory */ 495 /* Get nameidata of the parent directory */
495 err = path_lookup(path, LOOKUP_PARENT, &nd); 496 err = path_lookup(path, LOOKUP_PARENT, &nd);
@@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
505 err = 0; 506 err = 0;
506 autofs4_expire_wait(nd.path.dentry); 507 autofs4_expire_wait(nd.path.dentry);
507 spin_lock(&sbi->fs_lock); 508 spin_lock(&sbi->fs_lock);
508 param->arg1 = ino->uid; 509 param->requester.uid = ino->uid;
509 param->arg2 = ino->gid; 510 param->requester.gid = ino->gid;
510 spin_unlock(&sbi->fs_lock); 511 spin_unlock(&sbi->fs_lock);
511 } 512 }
512 513
@@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp,
529 int err = -EAGAIN; 530 int err = -EAGAIN;
530 int how; 531 int how;
531 532
532 how = param->arg1; 533 how = param->expire.how;
533 mnt = fp->f_path.mnt; 534 mnt = fp->f_path.mnt;
534 535
535 if (sbi->type & AUTOFS_TYPE_TRIGGER) 536 if (autofs_type_trigger(sbi->type))
536 dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); 537 dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
537 else 538 else
538 dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); 539 dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
@@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
565 struct autofs_sb_info *sbi, 566 struct autofs_sb_info *sbi,
566 struct autofs_dev_ioctl *param) 567 struct autofs_dev_ioctl *param)
567{ 568{
568 param->arg1 = 0; 569 param->askumount.may_umount = 0;
569 if (may_umount(fp->f_path.mnt)) 570 if (may_umount(fp->f_path.mnt))
570 param->arg1 = 1; 571 param->askumount.may_umount = 1;
571 return 0; 572 return 0;
572} 573}
573 574
@@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
600 struct nameidata nd; 601 struct nameidata nd;
601 const char *path; 602 const char *path;
602 unsigned int type; 603 unsigned int type;
604 unsigned int devid, magic;
603 int err = -ENOENT; 605 int err = -ENOENT;
604 606
605 if (param->size <= sizeof(*param)) { 607 if (param->size <= sizeof(*param)) {
@@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
608 } 610 }
609 611
610 path = param->path; 612 path = param->path;
611 type = param->arg1; 613 type = param->ismountpoint.in.type;
612 614
613 param->arg1 = 0; 615 param->ismountpoint.out.devid = devid = 0;
614 param->arg2 = 0; 616 param->ismountpoint.out.magic = magic = 0;
615 617
616 if (!fp || param->ioctlfd == -1) { 618 if (!fp || param->ioctlfd == -1) {
617 if (type == AUTOFS_TYPE_ANY) { 619 if (autofs_type_any(type)) {
618 struct super_block *sb; 620 struct super_block *sb;
619 621
620 err = path_lookup(path, LOOKUP_FOLLOW, &nd); 622 err = path_lookup(path, LOOKUP_FOLLOW, &nd);
@@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
622 goto out; 624 goto out;
623 625
624 sb = nd.path.dentry->d_sb; 626 sb = nd.path.dentry->d_sb;
625 param->arg1 = new_encode_dev(sb->s_dev); 627 devid = new_encode_dev(sb->s_dev);
626 } else { 628 } else {
627 struct autofs_info *ino; 629 struct autofs_info *ino;
628 630
@@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
635 goto out_release; 637 goto out_release;
636 638
637 ino = autofs4_dentry_ino(nd.path.dentry); 639 ino = autofs4_dentry_ino(nd.path.dentry);
638 param->arg1 = autofs4_get_dev(ino->sbi); 640 devid = autofs4_get_dev(ino->sbi);
639 } 641 }
640 642
641 err = 0; 643 err = 0;
642 if (nd.path.dentry->d_inode && 644 if (nd.path.dentry->d_inode &&
643 nd.path.mnt->mnt_root == nd.path.dentry) { 645 nd.path.mnt->mnt_root == nd.path.dentry) {
644 err = 1; 646 err = 1;
645 param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic; 647 magic = nd.path.dentry->d_inode->i_sb->s_magic;
646 } 648 }
647 } else { 649 } else {
648 dev_t devid = new_encode_dev(sbi->sb->s_dev); 650 dev_t dev = autofs4_get_dev(sbi);
649 651
650 err = path_lookup(path, LOOKUP_PARENT, &nd); 652 err = path_lookup(path, LOOKUP_PARENT, &nd);
651 if (err) 653 if (err)
652 goto out; 654 goto out;
653 655
654 err = autofs_dev_ioctl_find_super(&nd, devid); 656 err = autofs_dev_ioctl_find_super(&nd, dev);
655 if (err) 657 if (err)
656 goto out_release; 658 goto out_release;
657 659
658 param->arg1 = autofs4_get_dev(sbi); 660 devid = dev;
659 661
660 err = have_submounts(nd.path.dentry); 662 err = have_submounts(nd.path.dentry);
661 663
662 if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { 664 if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
663 if (follow_down(&nd.path.mnt, &nd.path.dentry)) { 665 if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
664 struct inode *inode = nd.path.dentry->d_inode; 666 struct inode *inode = nd.path.dentry->d_inode;
665 param->arg2 = inode->i_sb->s_magic; 667 magic = inode->i_sb->s_magic;
666 } 668 }
667 } 669 }
668 } 670 }
669 671
672 param->ismountpoint.out.devid = devid;
673 param->ismountpoint.out.magic = magic;
674
670out_release: 675out_release:
671 path_put(&nd.path); 676 path_put(&nd.path);
672out: 677out:
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c0..e3bd50776f9e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
63 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 63 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
64 64
65 /* This is an autofs submount, we can't expire it */ 65 /* This is an autofs submount, we can't expire it */
66 if (sbi->type == AUTOFS_TYPE_INDIRECT) 66 if (autofs_type_indirect(sbi->type))
67 goto done; 67 goto done;
68 68
69 /* 69 /*
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
490 if (arg && get_user(do_now, arg)) 490 if (arg && get_user(do_now, arg))
491 return -EFAULT; 491 return -EFAULT;
492 492
493 if (sbi->type & AUTOFS_TYPE_TRIGGER) 493 if (autofs_type_trigger(sbi->type))
494 dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); 494 dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
495 else 495 else
496 dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); 496 dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef4..716e12b627b2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
197 seq_printf(m, ",minproto=%d", sbi->min_proto); 197 seq_printf(m, ",minproto=%d", sbi->min_proto);
198 seq_printf(m, ",maxproto=%d", sbi->max_proto); 198 seq_printf(m, ",maxproto=%d", sbi->max_proto);
199 199
200 if (sbi->type & AUTOFS_TYPE_OFFSET) 200 if (autofs_type_offset(sbi->type))
201 seq_printf(m, ",offset"); 201 seq_printf(m, ",offset");
202 else if (sbi->type & AUTOFS_TYPE_DIRECT) 202 else if (autofs_type_direct(sbi->type))
203 seq_printf(m, ",direct"); 203 seq_printf(m, ",direct");
204 else 204 else
205 seq_printf(m, ",indirect"); 205 seq_printf(m, ",indirect");
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
284 *maxproto = option; 284 *maxproto = option;
285 break; 285 break;
286 case Opt_indirect: 286 case Opt_indirect:
287 *type = AUTOFS_TYPE_INDIRECT; 287 set_autofs_type_indirect(type);
288 break; 288 break;
289 case Opt_direct: 289 case Opt_direct:
290 *type = AUTOFS_TYPE_DIRECT; 290 set_autofs_type_direct(type);
291 break; 291 break;
292 case Opt_offset: 292 case Opt_offset:
293 *type = AUTOFS_TYPE_OFFSET; 293 set_autofs_type_offset(type);
294 break; 294 break;
295 default: 295 default:
296 return 1; 296 return 1;
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
338 sbi->sb = s; 338 sbi->sb = s;
339 sbi->version = 0; 339 sbi->version = 0;
340 sbi->sub_version = 0; 340 sbi->sub_version = 0;
341 sbi->type = AUTOFS_TYPE_INDIRECT; 341 set_autofs_type_indirect(&sbi->type);
342 sbi->min_proto = 0; 342 sbi->min_proto = 0;
343 sbi->max_proto = 0; 343 sbi->max_proto = 0;
344 mutex_init(&sbi->wq_mutex); 344 mutex_init(&sbi->wq_mutex);
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
380 } 380 }
381 381
382 root_inode->i_fop = &autofs4_root_operations; 382 root_inode->i_fop = &autofs4_root_operations;
383 root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ? 383 root_inode->i_op = autofs_type_trigger(sbi->type) ?
384 &autofs4_direct_root_inode_operations : 384 &autofs4_direct_root_inode_operations :
385 &autofs4_indirect_root_inode_operations; 385 &autofs4_indirect_root_inode_operations;
386 386
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
455 if (sb->s_root) { 455 if (sb->s_root) {
456 inode->i_uid = sb->s_root->d_inode->i_uid; 456 inode->i_uid = sb->s_root->d_inode->i_uid;
457 inode->i_gid = sb->s_root->d_inode->i_gid; 457 inode->i_gid = sb->s_root->d_inode->i_gid;
458 } else {
459 inode->i_uid = 0;
460 inode->i_gid = 0;
461 } 458 }
462 inode->i_blocks = 0;
463 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 459 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
464 460
465 if (S_ISDIR(inf->mode)) { 461 if (S_ISDIR(inf->mode)) {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb3..eeb246845909 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
337 * is very similar for indirect mounts except only dentrys 337 * is very similar for indirect mounts except only dentrys
338 * in the root of the autofs file system may be negative. 338 * in the root of the autofs file system may be negative.
339 */ 339 */
340 if (sbi->type & AUTOFS_TYPE_TRIGGER) 340 if (autofs_type_trigger(sbi->type))
341 return -ENOENT; 341 return -ENOENT;
342 else if (!IS_ROOT(dentry->d_parent)) 342 else if (!IS_ROOT(dentry->d_parent))
343 return -ENOENT; 343 return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
348 return -ENOMEM; 348 return -ENOMEM;
349 349
350 /* If this is a direct mount request create a dummy name */ 350 /* If this is a direct mount request create a dummy name */
351 if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER) 351 if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
352 qstr.len = sprintf(name, "%p", dentry); 352 qstr.len = sprintf(name, "%p", dentry);
353 else { 353 else {
354 qstr.len = autofs4_getpath(sbi, dentry, &name); 354 qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
406 type = autofs_ptype_expire_multi; 406 type = autofs_ptype_expire_multi;
407 } else { 407 } else {
408 if (notify == NFY_MOUNT) 408 if (notify == NFY_MOUNT)
409 type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? 409 type = autofs_type_trigger(sbi->type) ?
410 autofs_ptype_missing_direct : 410 autofs_ptype_missing_direct :
411 autofs_ptype_missing_indirect; 411 autofs_ptype_missing_indirect;
412 else 412 else
413 type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? 413 type = autofs_type_trigger(sbi->type) ?
414 autofs_ptype_expire_direct : 414 autofs_ptype_expire_direct :
415 autofs_ptype_expire_indirect; 415 autofs_ptype_expire_indirect;
416 } 416 }
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f1538c03b1b..a05287a23f62 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags)
132 return -EIO; 132 return -EIO;
133} 133}
134 134
135static int bad_file_dir_notify(struct file *file, unsigned long arg)
136{
137 return -EIO;
138}
139
140static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) 135static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
141{ 136{
142 return -EIO; 137 return -EIO;
@@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops =
179 .sendpage = bad_file_sendpage, 174 .sendpage = bad_file_sendpage,
180 .get_unmapped_area = bad_file_get_unmapped_area, 175 .get_unmapped_area = bad_file_get_unmapped_area,
181 .check_flags = bad_file_check_flags, 176 .check_flags = bad_file_check_flags,
182 .dir_notify = bad_file_dir_notify,
183 .flock = bad_file_flock, 177 .flock = bad_file_flock,
184 .splice_write = bad_file_splice_write, 178 .splice_write = bad_file_splice_write,
185 .splice_read = bad_file_splice_read, 179 .splice_read = bad_file_splice_read,
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
new file mode 100644
index 000000000000..7835d30f211f
--- /dev/null
+++ b/fs/befs/Kconfig
@@ -0,0 +1,26 @@
1config BEFS_FS
2 tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 select NLS
5 help
6 The BeOS File System (BeFS) is the native file system of Be, Inc's
7 BeOS. Notable features include support for arbitrary attributes
8 on files and directories, and database-like indices on selected
9 attributes. (Also note that this driver doesn't make those features
10 available at this time). It is a 64 bit filesystem, so it supports
11 extremely large volumes and files.
12
13 If you use this filesystem, you should also say Y to at least one
14 of the NLS (native language support) options below.
15
16 If you don't know what this is about, say N.
17
18 To compile this as a module, choose M here: the module will be
19 called befs.
20
21config BEFS_DEBUG
22 bool "Debug BeFS"
23 depends on BEFS_FS
24 help
25 If you say Y here, you can use the 'debug' mount option to enable
26 debugging output from the driver.
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b6dfee37c7b7..d06cb023ad02 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
378 inode->i_size = 0; 378 inode->i_size = 0;
379 inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; 379 inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
380 strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, 380 strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
381 BEFS_SYMLINK_LEN); 381 BEFS_SYMLINK_LEN - 1);
382 befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
382 } else { 383 } else {
383 int num_blks; 384 int num_blks;
384 385
@@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
477 kfree(link); 478 kfree(link);
478 befs_error(sb, "Failed to read entire long symlink"); 479 befs_error(sb, "Failed to read entire long symlink");
479 link = ERR_PTR(-EIO); 480 link = ERR_PTR(-EIO);
481 } else {
482 link[len - 1] = '\0';
480 } 483 }
481 } else { 484 } else {
482 link = befs_ino->i_data.symlink; 485 link = befs_ino->i_data.symlink;
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
new file mode 100644
index 000000000000..c2336c62024f
--- /dev/null
+++ b/fs/bfs/Kconfig
@@ -0,0 +1,19 @@
1config BFS_FS
2 tristate "BFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 help
5 Boot File System (BFS) is a file system used under SCO UnixWare to
6 allow the bootloader access to the kernel image and other important
7 files during the boot process. It is usually mounted under /stand
8 and corresponds to the slice marked as "STAND" in the UnixWare
9 partition. You should say Y if you want to read or write the files
10 on your /stand slice from within Linux. You then also need to say Y
11 to "UnixWare slices support", below. More information about the BFS
12 file system is contained in the file
13 <file:Documentation/filesystems/bfs.txt>.
14
15 If you don't know what this is about, say N.
16
17 To compile this as a module, choose M here: the module will be called
18 bfs. Note that the file system of your root partition (the one
19 containing the directory /) cannot be compiled as a module.
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee012..cc4062d12ca2 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
213{ 213{
214 struct bfs_sb_info *info = BFS_SB(s); 214 struct bfs_sb_info *info = BFS_SB(s);
215 215
216 if (!info)
217 return;
218
216 brelse(info->si_sbh); 219 brelse(info->si_sbh);
217 mutex_destroy(&info->bfs_lock); 220 mutex_destroy(&info->bfs_lock);
218 kfree(info->si_imap); 221 kfree(info->si_imap);
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
327 unsigned i, imap_len; 330 unsigned i, imap_len;
328 struct bfs_sb_info *info; 331 struct bfs_sb_info *info;
329 long ret = -EINVAL; 332 long ret = -EINVAL;
333 unsigned long i_sblock, i_eblock, i_eoff, s_size;
330 334
331 info = kzalloc(sizeof(*info), GFP_KERNEL); 335 info = kzalloc(sizeof(*info), GFP_KERNEL);
332 if (!info) 336 if (!info)
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
350 354
351 s->s_magic = BFS_MAGIC; 355 s->s_magic = BFS_MAGIC;
352 info->si_sbh = bh; 356 info->si_sbh = bh;
357
358 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
359 printf("Superblock is corrupted\n");
360 goto out;
361 }
362
353 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / 363 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
354 sizeof(struct bfs_inode) 364 sizeof(struct bfs_inode)
355 + BFS_ROOT_INO - 1; 365 + BFS_ROOT_INO - 1;
@@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
380 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; 390 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
381 info->si_freei = 0; 391 info->si_freei = 0;
382 info->si_lf_eblk = 0; 392 info->si_lf_eblk = 0;
393
394 /* can we read the last block? */
395 bh = sb_bread(s, info->si_blocks - 1);
396 if (!bh) {
397 printf("Last block not available: %lu\n", info->si_blocks - 1);
398 iput(inode);
399 ret = -EIO;
400 kfree(info->si_imap);
401 goto out;
402 }
403 brelse(bh);
404
383 bh = NULL; 405 bh = NULL;
384 for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { 406 for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
385 struct bfs_inode *di; 407 struct bfs_inode *di;
@@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
397 419
398 di = (struct bfs_inode *)bh->b_data + off; 420 di = (struct bfs_inode *)bh->b_data + off;
399 421
422 /* test if filesystem is not corrupted */
423
424 i_eoff = le32_to_cpu(di->i_eoffset);
425 i_sblock = le32_to_cpu(di->i_sblock);
426 i_eblock = le32_to_cpu(di->i_eblock);
427 s_size = le32_to_cpu(bfs_sb->s_end);
428
429 if (i_sblock > info->si_blocks ||
430 i_eblock > info->si_blocks ||
431 i_sblock > i_eblock ||
432 i_eoff > s_size ||
433 i_sblock * BFS_BSIZE > i_eoff) {
434
435 printf("Inode 0x%08x corrupted\n", i);
436
437 brelse(bh);
438 s->s_root = NULL;
439 kfree(info->si_imap);
440 kfree(info);
441 s->s_fs_info = NULL;
442 return -EIO;
443 }
444
400 if (!di->i_ino) { 445 if (!di->i_ino) {
401 info->si_freei++; 446 info->si_freei++;
402 continue; 447 continue;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f1f3f4192a60..b639dcf7c778 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -95,92 +95,55 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
95 int has_dumped = 0; 95 int has_dumped = 0;
96 unsigned long dump_start, dump_size; 96 unsigned long dump_start, dump_size;
97 struct user dump; 97 struct user dump;
98#if defined(__alpha__) 98#ifdef __alpha__
99# define START_DATA(u) (u.start_data) 99# define START_DATA(u) (u.start_data)
100#elif defined(__arm__) 100#else
101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code)
102#elif defined(__sparc__)
103# define START_DATA(u) (u.u_tsize)
104#elif defined(__i386__) || defined(__mc68000__) || defined(__arch_um__)
105# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
106#endif 102#endif
107#ifdef __sparc__
108# define START_STACK(u) ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1))
109#else
110# define START_STACK(u) (u.start_stack) 103# define START_STACK(u) (u.start_stack)
111#endif
112 104
113 fs = get_fs(); 105 fs = get_fs();
114 set_fs(KERNEL_DS); 106 set_fs(KERNEL_DS);
115 has_dumped = 1; 107 has_dumped = 1;
116 current->flags |= PF_DUMPCORE; 108 current->flags |= PF_DUMPCORE;
117 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); 109 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
118#ifndef __sparc__
119 dump.u_ar0 = offsetof(struct user, regs); 110 dump.u_ar0 = offsetof(struct user, regs);
120#endif
121 dump.signal = signr; 111 dump.signal = signr;
122 aout_dump_thread(regs, &dump); 112 aout_dump_thread(regs, &dump);
123 113
124/* If the size of the dump file exceeds the rlimit, then see what would happen 114/* If the size of the dump file exceeds the rlimit, then see what would happen
125 if we wrote the stack, but not the data area. */ 115 if we wrote the stack, but not the data area. */
126#ifdef __sparc__
127 if ((dump.u_dsize + dump.u_ssize) > limit)
128 dump.u_dsize = 0;
129#else
130 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) 116 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit)
131 dump.u_dsize = 0; 117 dump.u_dsize = 0;
132#endif
133 118
134/* Make sure we have enough room to write the stack and data areas. */ 119/* Make sure we have enough room to write the stack and data areas. */
135#ifdef __sparc__
136 if (dump.u_ssize > limit)
137 dump.u_ssize = 0;
138#else
139 if ((dump.u_ssize + 1) * PAGE_SIZE > limit) 120 if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
140 dump.u_ssize = 0; 121 dump.u_ssize = 0;
141#endif
142 122
143/* make sure we actually have a data and stack area to dump */ 123/* make sure we actually have a data and stack area to dump */
144 set_fs(USER_DS); 124 set_fs(USER_DS);
145#ifdef __sparc__
146 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize))
147 dump.u_dsize = 0;
148 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize))
149 dump.u_ssize = 0;
150#else
151 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 125 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
152 dump.u_dsize = 0; 126 dump.u_dsize = 0;
153 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 127 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
154 dump.u_ssize = 0; 128 dump.u_ssize = 0;
155#endif
156 129
157 set_fs(KERNEL_DS); 130 set_fs(KERNEL_DS);
158/* struct user */ 131/* struct user */
159 DUMP_WRITE(&dump,sizeof(dump)); 132 DUMP_WRITE(&dump,sizeof(dump));
160/* Now dump all of the user data. Include malloced stuff as well */ 133/* Now dump all of the user data. Include malloced stuff as well */
161#ifndef __sparc__
162 DUMP_SEEK(PAGE_SIZE); 134 DUMP_SEEK(PAGE_SIZE);
163#endif
164/* now we start writing out the user space info */ 135/* now we start writing out the user space info */
165 set_fs(USER_DS); 136 set_fs(USER_DS);
166/* Dump the data area */ 137/* Dump the data area */
167 if (dump.u_dsize != 0) { 138 if (dump.u_dsize != 0) {
168 dump_start = START_DATA(dump); 139 dump_start = START_DATA(dump);
169#ifdef __sparc__
170 dump_size = dump.u_dsize;
171#else
172 dump_size = dump.u_dsize << PAGE_SHIFT; 140 dump_size = dump.u_dsize << PAGE_SHIFT;
173#endif
174 DUMP_WRITE(dump_start,dump_size); 141 DUMP_WRITE(dump_start,dump_size);
175 } 142 }
176/* Now prepare to dump the stack area */ 143/* Now prepare to dump the stack area */
177 if (dump.u_ssize != 0) { 144 if (dump.u_ssize != 0) {
178 dump_start = START_STACK(dump); 145 dump_start = START_STACK(dump);
179#ifdef __sparc__
180 dump_size = dump.u_ssize;
181#else
182 dump_size = dump.u_ssize << PAGE_SHIFT; 146 dump_size = dump.u_ssize << PAGE_SHIFT;
183#endif
184 DUMP_WRITE(dump_start,dump_size); 147 DUMP_WRITE(dump_start,dump_size);
185 } 148 }
186/* Finally dump the task struct. Not be used by gdb, but could be useful */ 149/* Finally dump the task struct. Not be used by gdb, but could be useful */
@@ -205,29 +168,24 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
205 int envc = bprm->envc; 168 int envc = bprm->envc;
206 169
207 sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); 170 sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
208#ifdef __sparc__
209 /* This imposes the proper stack alignment for a new process. */
210 sp = (void __user *) (((unsigned long) sp) & ~7);
211 if ((envc+argc+3)&1) --sp;
212#endif
213#ifdef __alpha__ 171#ifdef __alpha__
214/* whee.. test-programs are so much fun. */ 172/* whee.. test-programs are so much fun. */
215 put_user(0, --sp); 173 put_user(0, --sp);
216 put_user(0, --sp); 174 put_user(0, --sp);
217 if (bprm->loader) { 175 if (bprm->loader) {
218 put_user(0, --sp); 176 put_user(0, --sp);
219 put_user(0x3eb, --sp); 177 put_user(1003, --sp);
220 put_user(bprm->loader, --sp); 178 put_user(bprm->loader, --sp);
221 put_user(0x3ea, --sp); 179 put_user(1002, --sp);
222 } 180 }
223 put_user(bprm->exec, --sp); 181 put_user(bprm->exec, --sp);
224 put_user(0x3e9, --sp); 182 put_user(1001, --sp);
225#endif 183#endif
226 sp -= envc+1; 184 sp -= envc+1;
227 envp = (char __user * __user *) sp; 185 envp = (char __user * __user *) sp;
228 sp -= argc+1; 186 sp -= argc+1;
229 argv = (char __user * __user *) sp; 187 argv = (char __user * __user *) sp;
230#if defined(__i386__) || defined(__mc68000__) || defined(__arm__) || defined(__arch_um__) 188#ifndef __alpha__
231 put_user((unsigned long) envp,--sp); 189 put_user((unsigned long) envp,--sp);
232 put_user((unsigned long) argv,--sp); 190 put_user((unsigned long) argv,--sp);
233#endif 191#endif
@@ -300,13 +258,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
300 return retval; 258 return retval;
301 259
302 /* OK, This is the point of no return */ 260 /* OK, This is the point of no return */
303#if defined(__alpha__) 261#ifdef __alpha__
304 SET_AOUT_PERSONALITY(bprm, ex); 262 SET_AOUT_PERSONALITY(bprm, ex);
305#elif defined(__sparc__)
306 set_personality(PER_SUNOS);
307#if !defined(__sparc_v9__)
308 memcpy(&current->thread.core_exec, &ex, sizeof(struct exec));
309#endif
310#else 263#else
311 set_personality(PER_LINUX); 264 set_personality(PER_LINUX);
312#endif 265#endif
@@ -322,24 +275,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
322 275
323 install_exec_creds(bprm); 276 install_exec_creds(bprm);
324 current->flags &= ~PF_FORKNOEXEC; 277 current->flags &= ~PF_FORKNOEXEC;
325#ifdef __sparc__
326 if (N_MAGIC(ex) == NMAGIC) {
327 loff_t pos = fd_offset;
328 /* Fuck me plenty... */
329 /* <AOL></AOL> */
330 down_write(&current->mm->mmap_sem);
331 error = do_brk(N_TXTADDR(ex), ex.a_text);
332 up_write(&current->mm->mmap_sem);
333 bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex),
334 ex.a_text, &pos);
335 down_write(&current->mm->mmap_sem);
336 error = do_brk(N_DATADDR(ex), ex.a_data);
337 up_write(&current->mm->mmap_sem);
338 bprm->file->f_op->read(bprm->file, (char *) N_DATADDR(ex),
339 ex.a_data, &pos);
340 goto beyond_if;
341 }
342#endif
343 278
344 if (N_MAGIC(ex) == OMAGIC) { 279 if (N_MAGIC(ex) == OMAGIC) {
345 unsigned long text_addr, map_size; 280 unsigned long text_addr, map_size;
@@ -347,7 +282,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
347 282
348 text_addr = N_TXTADDR(ex); 283 text_addr = N_TXTADDR(ex);
349 284
350#if defined(__alpha__) || defined(__sparc__) 285#ifdef __alpha__
351 pos = fd_offset; 286 pos = fd_offset;
352 map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; 287 map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
353#else 288#else
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
152 elf_addr_t __user *sp; 152 elf_addr_t __user *sp;
153 elf_addr_t __user *u_platform; 153 elf_addr_t __user *u_platform;
154 elf_addr_t __user *u_base_platform; 154 elf_addr_t __user *u_base_platform;
155 elf_addr_t __user *u_rand_bytes;
155 const char *k_platform = ELF_PLATFORM; 156 const char *k_platform = ELF_PLATFORM;
156 const char *k_base_platform = ELF_BASE_PLATFORM; 157 const char *k_base_platform = ELF_BASE_PLATFORM;
158 unsigned char k_rand_bytes[16];
157 int items; 159 int items;
158 elf_addr_t *elf_info; 160 elf_addr_t *elf_info;
159 int ei_index = 0; 161 int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
196 return -EFAULT; 198 return -EFAULT;
197 } 199 }
198 200
201 /*
202 * Generate 16 random bytes for userspace PRNG seeding.
203 */
204 get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
205 u_rand_bytes = (elf_addr_t __user *)
206 STACK_ALLOC(p, sizeof(k_rand_bytes));
207 if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
208 return -EFAULT;
209
199 /* Create the ELF interpreter info */ 210 /* Create the ELF interpreter info */
200 elf_info = (elf_addr_t *)current->mm->saved_auxv; 211 elf_info = (elf_addr_t *)current->mm->saved_auxv;
201 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ 212 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
228 NEW_AUX_ENT(AT_GID, cred->gid); 239 NEW_AUX_ENT(AT_GID, cred->gid);
229 NEW_AUX_ENT(AT_EGID, cred->egid); 240 NEW_AUX_ENT(AT_EGID, cred->egid);
230 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); 241 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
242 NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
231 NEW_AUX_ENT(AT_EXECFN, bprm->exec); 243 NEW_AUX_ENT(AT_EXECFN, bprm->exec);
232 if (k_platform) { 244 if (k_platform) {
233 NEW_AUX_ENT(AT_PLATFORM, 245 NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aa5b43205e37..f3e72c5c19f5 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
168 struct elf_fdpic_params exec_params, interp_params; 168 struct elf_fdpic_params exec_params, interp_params;
169 struct elf_phdr *phdr; 169 struct elf_phdr *phdr;
170 unsigned long stack_size, entryaddr; 170 unsigned long stack_size, entryaddr;
171#ifndef CONFIG_MMU
172 unsigned long fullsize;
173#endif
174#ifdef ELF_FDPIC_PLAT_INIT 171#ifdef ELF_FDPIC_PLAT_INIT
175 unsigned long dynaddr; 172 unsigned long dynaddr;
176#endif 173#endif
@@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
390 goto error_kill; 387 goto error_kill;
391 } 388 }
392 389
393 /* expand the stack mapping to use up the entire allocation granule */
394 fullsize = kobjsize((char *) current->mm->start_brk);
395 if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
396 fullsize, 0, 0)))
397 stack_size = fullsize;
398 up_write(&current->mm->mmap_sem); 390 up_write(&current->mm->mmap_sem);
399 391
400 current->mm->brk = current->mm->start_brk; 392 current->mm->brk = current->mm->start_brk;
@@ -1567,11 +1559,9 @@ end_coredump:
1567static int elf_fdpic_dump_segments(struct file *file, size_t *size, 1559static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1568 unsigned long *limit, unsigned long mm_flags) 1560 unsigned long *limit, unsigned long mm_flags)
1569{ 1561{
1570 struct vm_list_struct *vml; 1562 struct vm_area_struct *vma;
1571
1572 for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
1573 struct vm_area_struct *vma = vml->vma;
1574 1563
1564 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1575 if (!maydump(vma, mm_flags)) 1565 if (!maydump(vma, mm_flags))
1576 continue; 1566 continue;
1577 1567
@@ -1617,9 +1607,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1617 elf_fpxregset_t *xfpu = NULL; 1607 elf_fpxregset_t *xfpu = NULL;
1618#endif 1608#endif
1619 int thread_status_size = 0; 1609 int thread_status_size = 0;
1620#ifndef CONFIG_MMU
1621 struct vm_list_struct *vml;
1622#endif
1623 elf_addr_t *auxv; 1610 elf_addr_t *auxv;
1624 unsigned long mm_flags; 1611 unsigned long mm_flags;
1625 1612
@@ -1685,13 +1672,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1685 fill_prstatus(prstatus, current, signr); 1672 fill_prstatus(prstatus, current, signr);
1686 elf_core_copy_regs(&prstatus->pr_reg, regs); 1673 elf_core_copy_regs(&prstatus->pr_reg, regs);
1687 1674
1688#ifdef CONFIG_MMU
1689 segs = current->mm->map_count; 1675 segs = current->mm->map_count;
1690#else
1691 segs = 0;
1692 for (vml = current->mm->context.vmlist; vml; vml = vml->next)
1693 segs++;
1694#endif
1695#ifdef ELF_CORE_EXTRA_PHDRS 1676#ifdef ELF_CORE_EXTRA_PHDRS
1696 segs += ELF_CORE_EXTRA_PHDRS; 1677 segs += ELF_CORE_EXTRA_PHDRS;
1697#endif 1678#endif
@@ -1766,20 +1747,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1766 mm_flags = current->mm->flags; 1747 mm_flags = current->mm->flags;
1767 1748
1768 /* write program headers for segments dump */ 1749 /* write program headers for segments dump */
1769 for ( 1750 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1770#ifdef CONFIG_MMU
1771 vma = current->mm->mmap; vma; vma = vma->vm_next
1772#else
1773 vml = current->mm->context.vmlist; vml; vml = vml->next
1774#endif
1775 ) {
1776 struct elf_phdr phdr; 1751 struct elf_phdr phdr;
1777 size_t sz; 1752 size_t sz;
1778 1753
1779#ifndef CONFIG_MMU
1780 vma = vml->vma;
1781#endif
1782
1783 sz = vma->vm_end - vma->vm_start; 1754 sz = vma->vm_end - vma->vm_start;
1784 1755
1785 phdr.p_type = PT_LOAD; 1756 phdr.p_type = PT_LOAD;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7bbd5c6b3725..5cebf0b37798 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm,
417 unsigned long textpos = 0, datapos = 0, result; 417 unsigned long textpos = 0, datapos = 0, result;
418 unsigned long realdatastart = 0; 418 unsigned long realdatastart = 0;
419 unsigned long text_len, data_len, bss_len, stack_len, flags; 419 unsigned long text_len, data_len, bss_len, stack_len, flags;
420 unsigned long len, reallen, memp = 0; 420 unsigned long len, memp = 0;
421 unsigned long extra, rlim; 421 unsigned long memp_size, extra, rlim;
422 unsigned long *reloc = 0, *rp; 422 unsigned long *reloc = 0, *rp;
423 struct inode *inode; 423 struct inode *inode;
424 int i, rev, relocs = 0; 424 int i, rev, relocs = 0;
@@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm,
543 } 543 }
544 544
545 len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); 545 len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
546 len = PAGE_ALIGN(len);
546 down_write(&current->mm->mmap_sem); 547 down_write(&current->mm->mmap_sem);
547 realdatastart = do_mmap(0, 0, len, 548 realdatastart = do_mmap(0, 0, len,
548 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); 549 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
549 /* Remap to use all available slack region space */
550 if (realdatastart && (realdatastart < (unsigned long)-4096)) {
551 reallen = kobjsize((void *)realdatastart);
552 if (reallen > len) {
553 realdatastart = do_mremap(realdatastart, len,
554 reallen, MREMAP_FIXED, realdatastart);
555 }
556 }
557 up_write(&current->mm->mmap_sem); 550 up_write(&current->mm->mmap_sem);
558 551
559 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { 552 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
@@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm,
591 584
592 reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); 585 reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
593 memp = realdatastart; 586 memp = realdatastart;
594 587 memp_size = len;
595 } else { 588 } else {
596 589
597 len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); 590 len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
591 len = PAGE_ALIGN(len);
598 down_write(&current->mm->mmap_sem); 592 down_write(&current->mm->mmap_sem);
599 textpos = do_mmap(0, 0, len, 593 textpos = do_mmap(0, 0, len,
600 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); 594 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
601 /* Remap to use all available slack region space */
602 if (textpos && (textpos < (unsigned long) -4096)) {
603 reallen = kobjsize((void *)textpos);
604 if (reallen > len) {
605 textpos = do_mremap(textpos, len, reallen,
606 MREMAP_FIXED, textpos);
607 }
608 }
609 up_write(&current->mm->mmap_sem); 595 up_write(&current->mm->mmap_sem);
610 596
611 if (!textpos || textpos >= (unsigned long) -4096) { 597 if (!textpos || textpos >= (unsigned long) -4096) {
@@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm,
622 reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + 608 reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
623 MAX_SHARED_LIBS * sizeof(unsigned long)); 609 MAX_SHARED_LIBS * sizeof(unsigned long));
624 memp = textpos; 610 memp = textpos;
625 611 memp_size = len;
626#ifdef CONFIG_BINFMT_ZFLAT 612#ifdef CONFIG_BINFMT_ZFLAT
627 /* 613 /*
628 * load it all in and treat it like a RAM load from now on 614 * load it all in and treat it like a RAM load from now on
@@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm,
680 * set up the brk stuff, uses any slack left in data/bss/stack 666 * set up the brk stuff, uses any slack left in data/bss/stack
681 * allocation. We put the brk after the bss (between the bss 667 * allocation. We put the brk after the bss (between the bss
682 * and stack) like other platforms. 668 * and stack) like other platforms.
669 * Userspace code relies on the stack pointer starting out at
670 * an address right at the end of a page.
683 */ 671 */
684 current->mm->start_brk = datapos + data_len + bss_len; 672 current->mm->start_brk = datapos + data_len + bss_len;
685 current->mm->brk = (current->mm->start_brk + 3) & ~3; 673 current->mm->brk = (current->mm->start_brk + 3) & ~3;
686 current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len; 674 current->mm->context.end_brk = memp + memp_size - stack_len;
687 } 675 }
688 676
689 if (flags & FLAT_FLAG_KTRACE) 677 if (flags & FLAT_FLAG_KTRACE)
@@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm,
790 778
791 /* zero the BSS, BRK and stack areas */ 779 /* zero the BSS, BRK and stack areas */
792 memset((void*)(datapos + data_len), 0, bss_len + 780 memset((void*)(datapos + data_len), 0, bss_len +
793 (memp + kobjsize((void *) memp) - stack_len - /* end brk */ 781 (memp + memp_size - stack_len - /* end brk */
794 libinfo->lib_list[id].start_brk) + /* start brk */ 782 libinfo->lib_list[id].start_brk) + /* start brk */
795 stack_len); 783 stack_len);
796 784
797 return 0; 785 return 0;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b3..c4e83537ead7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
496 496
497 if (inode) { 497 if (inode) {
498 inode->i_mode = mode; 498 inode->i_mode = mode;
499 inode->i_uid = 0;
500 inode->i_gid = 0;
501 inode->i_blocks = 0;
502 inode->i_atime = inode->i_mtime = inode->i_ctime = 499 inode->i_atime = inode->i_mtime = inode->i_ctime =
503 current_fs_time(inode->i_sb); 500 current_fs_time(inode->i_sb);
504 } 501 }
@@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = {
652static ssize_t 649static ssize_t
653bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) 650bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
654{ 651{
655 char *s = enabled ? "enabled" : "disabled"; 652 char *s = enabled ? "enabled\n" : "disabled\n";
656 653
657 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); 654 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
658} 655}
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 77ebc3c263d6..549b0144da11 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -140,7 +140,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
140 140
141 iv = bip_vec_idx(bip, bip->bip_vcnt); 141 iv = bip_vec_idx(bip, bip->bip_vcnt);
142 BUG_ON(iv == NULL); 142 BUG_ON(iv == NULL);
143 BUG_ON(iv->bv_page != NULL);
144 143
145 iv->bv_page = page; 144 iv->bv_page = page;
146 iv->bv_len = len; 145 iv->bv_len = len;
@@ -465,7 +464,7 @@ static int bio_integrity_verify(struct bio *bio)
465 464
466 if (ret) { 465 if (ret) {
467 kunmap_atomic(kaddr, KM_USER0); 466 kunmap_atomic(kaddr, KM_USER0);
468 break; 467 return ret;
469 } 468 }
470 469
471 sectors = bv->bv_len / bi->sector_size; 470 sectors = bv->bv_len / bi->sector_size;
@@ -493,18 +492,13 @@ static void bio_integrity_verify_fn(struct work_struct *work)
493 struct bio_integrity_payload *bip = 492 struct bio_integrity_payload *bip =
494 container_of(work, struct bio_integrity_payload, bip_work); 493 container_of(work, struct bio_integrity_payload, bip_work);
495 struct bio *bio = bip->bip_bio; 494 struct bio *bio = bip->bip_bio;
496 int error = bip->bip_error; 495 int error;
497 496
498 if (bio_integrity_verify(bio)) { 497 error = bio_integrity_verify(bio);
499 clear_bit(BIO_UPTODATE, &bio->bi_flags);
500 error = -EIO;
501 }
502 498
503 /* Restore original bio completion handler */ 499 /* Restore original bio completion handler */
504 bio->bi_end_io = bip->bip_end_io; 500 bio->bi_end_io = bip->bip_end_io;
505 501 bio_endio(bio, error);
506 if (bio->bi_end_io)
507 bio->bi_end_io(bio, error);
508} 502}
509 503
510/** 504/**
@@ -525,7 +519,17 @@ void bio_integrity_endio(struct bio *bio, int error)
525 519
526 BUG_ON(bip->bip_bio != bio); 520 BUG_ON(bip->bip_bio != bio);
527 521
528 bip->bip_error = error; 522 /* In case of an I/O error there is no point in verifying the
523 * integrity metadata. Restore original bio end_io handler
524 * and run it.
525 */
526 if (error) {
527 bio->bi_end_io = bip->bip_end_io;
528 bio_endio(bio, error);
529
530 return;
531 }
532
529 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); 533 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
530 queue_work(kintegrityd_wq, &bip->bip_work); 534 queue_work(kintegrityd_wq, &bip->bip_work);
531} 535}
diff --git a/fs/bio.c b/fs/bio.c
index 711cee103602..062299acbccd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
788 int i, ret; 788 int i, ret;
789 int nr_pages = 0; 789 int nr_pages = 0;
790 unsigned int len = 0; 790 unsigned int len = 0;
791 unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
791 792
792 for (i = 0; i < iov_count; i++) { 793 for (i = 0; i < iov_count; i++) {
793 unsigned long uaddr; 794 unsigned long uaddr;
@@ -814,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
814 bio->bi_rw |= (!write_to_vm << BIO_RW); 815 bio->bi_rw |= (!write_to_vm << BIO_RW);
815 816
816 ret = 0; 817 ret = 0;
817 i = 0; 818
819 if (map_data) {
820 nr_pages = 1 << map_data->page_order;
821 i = map_data->offset / PAGE_SIZE;
822 }
818 while (len) { 823 while (len) {
819 unsigned int bytes; 824 unsigned int bytes = PAGE_SIZE;
820 825
821 if (map_data) 826 bytes -= offset;
822 bytes = 1U << (PAGE_SHIFT + map_data->page_order);
823 else
824 bytes = PAGE_SIZE;
825 827
826 if (bytes > len) 828 if (bytes > len)
827 bytes = len; 829 bytes = len;
828 830
829 if (map_data) { 831 if (map_data) {
830 if (i == map_data->nr_entries) { 832 if (i == map_data->nr_entries * nr_pages) {
831 ret = -ENOMEM; 833 ret = -ENOMEM;
832 break; 834 break;
833 } 835 }
834 page = map_data->pages[i++]; 836
835 } else 837 page = map_data->pages[i / nr_pages];
838 page += (i % nr_pages);
839
840 i++;
841 } else {
836 page = alloc_page(q->bounce_gfp | gfp_mask); 842 page = alloc_page(q->bounce_gfp | gfp_mask);
837 if (!page) { 843 if (!page) {
838 ret = -ENOMEM; 844 ret = -ENOMEM;
839 break; 845 break;
846 }
840 } 847 }
841 848
842 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) 849 if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
843 break; 850 break;
844 851
845 len -= bytes; 852 len -= bytes;
853 offset = 0;
846 } 854 }
847 855
848 if (ret) 856 if (ret)
@@ -851,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
851 /* 859 /*
852 * success 860 * success
853 */ 861 */
854 if (!write_to_vm) { 862 if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
855 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); 863 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
856 if (ret) 864 if (ret)
857 goto cleanup; 865 goto cleanup;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99e0ae1a4c78..b3c1efff5e1d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -285,6 +285,8 @@ static void init_once(void *foo)
285 INIT_LIST_HEAD(&bdev->bd_holder_list); 285 INIT_LIST_HEAD(&bdev->bd_holder_list);
286#endif 286#endif
287 inode_init_once(&ei->vfs_inode); 287 inode_init_once(&ei->vfs_inode);
288 /* Initialize mutex for freeze. */
289 mutex_init(&bdev->bd_fsfreeze_mutex);
288} 290}
289 291
290static inline void __bd_forget(struct inode *inode) 292static inline void __bd_forget(struct inode *inode)
@@ -326,12 +328,13 @@ static struct file_system_type bd_type = {
326 .kill_sb = kill_anon_super, 328 .kill_sb = kill_anon_super,
327}; 329};
328 330
329static struct vfsmount *bd_mnt __read_mostly; 331struct super_block *blockdev_superblock __read_mostly;
330struct super_block *blockdev_superblock;
331 332
332void __init bdev_cache_init(void) 333void __init bdev_cache_init(void)
333{ 334{
334 int err; 335 int err;
336 struct vfsmount *bd_mnt;
337
335 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 338 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
336 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 339 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
337 SLAB_MEM_SPREAD|SLAB_PANIC), 340 SLAB_MEM_SPREAD|SLAB_PANIC),
@@ -373,7 +376,7 @@ struct block_device *bdget(dev_t dev)
373 struct block_device *bdev; 376 struct block_device *bdev;
374 struct inode *inode; 377 struct inode *inode;
375 378
376 inode = iget5_locked(bd_mnt->mnt_sb, hash(dev), 379 inode = iget5_locked(blockdev_superblock, hash(dev),
377 bdev_test, bdev_set, &dev); 380 bdev_test, bdev_set, &dev);
378 381
379 if (!inode) 382 if (!inode)
@@ -463,7 +466,7 @@ void bd_forget(struct inode *inode)
463 466
464 spin_lock(&bdev_lock); 467 spin_lock(&bdev_lock);
465 if (inode->i_bdev) { 468 if (inode->i_bdev) {
466 if (inode->i_sb != blockdev_superblock) 469 if (!sb_is_blkdev_sb(inode->i_sb))
467 bdev = inode->i_bdev; 470 bdev = inode->i_bdev;
468 __bd_forget(inode); 471 __bd_forget(inode);
469 } 472 }
@@ -1004,6 +1007,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1004 } 1007 }
1005 1008
1006 lock_kernel(); 1009 lock_kernel();
1010 restart:
1007 1011
1008 ret = -ENXIO; 1012 ret = -ENXIO;
1009 disk = get_gendisk(bdev->bd_dev, &partno); 1013 disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1024,6 +1028,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1024 1028
1025 if (disk->fops->open) { 1029 if (disk->fops->open) {
1026 ret = disk->fops->open(bdev, mode); 1030 ret = disk->fops->open(bdev, mode);
1031 if (ret == -ERESTARTSYS) {
1032 /* Lost a race with 'disk' being
1033 * deleted, try again.
1034 * See md.c
1035 */
1036 disk_put_part(bdev->bd_part);
1037 bdev->bd_part = NULL;
1038 module_put(disk->fops->owner);
1039 put_disk(disk);
1040 bdev->bd_disk = NULL;
1041 mutex_unlock(&bdev->bd_mutex);
1042 goto restart;
1043 }
1027 if (ret) 1044 if (ret)
1028 goto out_clear; 1045 goto out_clear;
1029 } 1046 }
@@ -1219,6 +1236,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1219 return blkdev_ioctl(bdev, mode, cmd, arg); 1236 return blkdev_ioctl(bdev, mode, cmd, arg);
1220} 1237}
1221 1238
1239/*
1240 * Try to release a page associated with block device when the system
1241 * is under memory pressure.
1242 */
1243static int blkdev_releasepage(struct page *page, gfp_t wait)
1244{
1245 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1246
1247 if (super && super->s_op->bdev_try_to_free_page)
1248 return super->s_op->bdev_try_to_free_page(super, page, wait);
1249
1250 return try_to_free_buffers(page);
1251}
1252
1222static const struct address_space_operations def_blk_aops = { 1253static const struct address_space_operations def_blk_aops = {
1223 .readpage = blkdev_readpage, 1254 .readpage = blkdev_readpage,
1224 .writepage = blkdev_writepage, 1255 .writepage = blkdev_writepage,
@@ -1226,6 +1257,7 @@ static const struct address_space_operations def_blk_aops = {
1226 .write_begin = blkdev_write_begin, 1257 .write_begin = blkdev_write_begin,
1227 .write_end = blkdev_write_end, 1258 .write_end = blkdev_write_end,
1228 .writepages = generic_writepages, 1259 .writepages = generic_writepages,
1260 .releasepage = blkdev_releasepage,
1229 .direct_IO = blkdev_direct_IO, 1261 .direct_IO = blkdev_direct_IO,
1230}; 1262};
1231 1263
@@ -1261,7 +1293,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
1261 1293
1262/** 1294/**
1263 * lookup_bdev - lookup a struct block_device by name 1295 * lookup_bdev - lookup a struct block_device by name
1264 * @path: special file representing the block device 1296 * @pathname: special file representing the block device
1265 * 1297 *
1266 * Get a reference to the blockdevice at @pathname in the current 1298 * Get a reference to the blockdevice at @pathname in the current
1267 * namespace if possible and return it. Return ERR_PTR(error) 1299 * namespace if possible and return it. Return ERR_PTR(error)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
new file mode 100644
index 000000000000..f8fcf999ea1b
--- /dev/null
+++ b/fs/btrfs/Kconfig
@@ -0,0 +1,18 @@
1config BTRFS_FS
2 tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
3 depends on EXPERIMENTAL
4 select LIBCRC32C
5 select ZLIB_INFLATE
6 select ZLIB_DEFLATE
7 help
8 Btrfs is a new filesystem with extents, writable snapshotting,
9 support for multiple devices and many more features.
10
11 Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
12 FINALIZED. You should say N here unless you are interested in
13 testing Btrfs with non-critical data.
14
15 To compile this file system support as a module, choose M here. The
16 module will be called btrfs.
17
18 If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..d2cf5a54a4b8
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
1ifneq ($(KERNELRELEASE),)
2# kbuild part of makefile
3
4obj-$(CONFIG_BTRFS_FS) := btrfs.o
5btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 file-item.o inode-item.o inode-map.o disk-io.o \
7 transaction.o inode.o file.o tree-defrag.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o
12else
13
14# Normal Makefile
15
16KERNELDIR := /lib/modules/`uname -r`/build
17all:
18 $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
19
20modules_install:
21 $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
22clean:
23 $(MAKE) -C $(KERNELDIR) M=`pwd` clean
24
25endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..1d53b62dbba5
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/string.h>
21#include <linux/xattr.h>
22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h>
24#include <linux/sched.h>
25
26#include "ctree.h"
27#include "btrfs_inode.h"
28#include "xattr.h"
29
30#ifdef CONFIG_FS_POSIX_ACL
31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{
45 int size;
46 const char *name;
47 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl;
49
50 switch (type) {
51 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break;
55 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break;
59 default:
60 return ERR_PTR(-EINVAL);
61 }
62
63 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED)
65 acl = posix_acl_dup(*p_acl);
66 spin_unlock(&inode->i_lock);
67
68 if (acl)
69 return acl;
70
71
72 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS);
75 if (!value)
76 return ERR_PTR(-ENOMEM);
77 size = __btrfs_getxattr(inode, name, value, size);
78 if (size > 0) {
79 acl = posix_acl_from_xattr(value, size);
80 btrfs_update_cached_acl(inode, p_acl, acl);
81 }
82 kfree(value);
83 } else if (size == -ENOENT) {
84 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl);
86 }
87
88 return acl;
89}
90
91static int btrfs_xattr_get_acl(struct inode *inode, int type,
92 void *value, size_t size)
93{
94 struct posix_acl *acl;
95 int ret = 0;
96
97 acl = btrfs_get_acl(inode, type);
98
99 if (IS_ERR(acl))
100 return PTR_ERR(acl);
101 if (acl == NULL)
102 return -ENODATA;
103 ret = posix_acl_to_xattr(acl, value, size);
104 posix_acl_release(acl);
105
106 return ret;
107}
108
109/*
110 * Needs to be called with fs_mutex held
111 */
112static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
113{
114 int ret, size = 0;
115 const char *name;
116 struct posix_acl **p_acl;
117 char *value = NULL;
118 mode_t mode;
119
120 if (acl) {
121 ret = posix_acl_valid(acl);
122 if (ret < 0)
123 return ret;
124 ret = 0;
125 }
126
127 switch (type) {
128 case ACL_TYPE_ACCESS:
129 mode = inode->i_mode;
130 ret = posix_acl_equiv_mode(acl, &mode);
131 if (ret < 0)
132 return ret;
133 ret = 0;
134 inode->i_mode = mode;
135 name = POSIX_ACL_XATTR_ACCESS;
136 p_acl = &BTRFS_I(inode)->i_acl;
137 break;
138 case ACL_TYPE_DEFAULT:
139 if (!S_ISDIR(inode->i_mode))
140 return acl ? -EINVAL : 0;
141 name = POSIX_ACL_XATTR_DEFAULT;
142 p_acl = &BTRFS_I(inode)->i_default_acl;
143 break;
144 default:
145 return -EINVAL;
146 }
147
148 if (acl) {
149 size = posix_acl_xattr_size(acl->a_count);
150 value = kmalloc(size, GFP_NOFS);
151 if (!value) {
152 ret = -ENOMEM;
153 goto out;
154 }
155
156 ret = posix_acl_to_xattr(acl, value, size);
157 if (ret < 0)
158 goto out;
159 }
160
161 ret = __btrfs_setxattr(inode, name, value, size, 0);
162
163out:
164 kfree(value);
165
166 if (!ret)
167 btrfs_update_cached_acl(inode, p_acl, acl);
168
169 return ret;
170}
171
172static int btrfs_xattr_set_acl(struct inode *inode, int type,
173 const void *value, size_t size)
174{
175 int ret = 0;
176 struct posix_acl *acl = NULL;
177
178 if (value) {
179 acl = posix_acl_from_xattr(value, size);
180 if (acl == NULL) {
181 value = NULL;
182 size = 0;
183 } else if (IS_ERR(acl)) {
184 return PTR_ERR(acl);
185 }
186 }
187
188 ret = btrfs_set_acl(inode, acl, type);
189
190 posix_acl_release(acl);
191
192 return ret;
193}
194
195
196static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
197 void *value, size_t size)
198{
199 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
200}
201
202static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
203 const void *value, size_t size, int flags)
204{
205 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
206}
207
208static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
209 void *value, size_t size)
210{
211 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
212}
213
214static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
215 const void *value, size_t size, int flags)
216{
217 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
218}
219
220int btrfs_check_acl(struct inode *inode, int mask)
221{
222 struct posix_acl *acl;
223 int error = -EAGAIN;
224
225 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
226
227 if (IS_ERR(acl))
228 return PTR_ERR(acl);
229 if (acl) {
230 error = posix_acl_permission(inode, acl, mask);
231 posix_acl_release(acl);
232 }
233
234 return error;
235}
236
237/*
238 * btrfs_init_acl is already generally called under fs_mutex, so the locking
239 * stuff has been fixed to work with that. If the locking stuff changes, we
240 * need to re-evaluate the acl locking stuff.
241 */
242int btrfs_init_acl(struct inode *inode, struct inode *dir)
243{
244 struct posix_acl *acl = NULL;
245 int ret = 0;
246
247 /* this happens with subvols */
248 if (!dir)
249 return 0;
250
251 if (!S_ISLNK(inode->i_mode)) {
252 if (IS_POSIXACL(dir)) {
253 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
254 if (IS_ERR(acl))
255 return PTR_ERR(acl);
256 }
257
258 if (!acl)
259 inode->i_mode &= ~current->fs->umask;
260 }
261
262 if (IS_POSIXACL(dir) && acl) {
263 struct posix_acl *clone;
264 mode_t mode;
265
266 if (S_ISDIR(inode->i_mode)) {
267 ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
268 if (ret)
269 goto failed;
270 }
271 clone = posix_acl_clone(acl, GFP_NOFS);
272 ret = -ENOMEM;
273 if (!clone)
274 goto failed;
275
276 mode = inode->i_mode;
277 ret = posix_acl_create_masq(clone, &mode);
278 if (ret >= 0) {
279 inode->i_mode = mode;
280 if (ret > 0) {
281 /* we need an acl */
282 ret = btrfs_set_acl(inode, clone,
283 ACL_TYPE_ACCESS);
284 }
285 }
286 }
287failed:
288 posix_acl_release(acl);
289
290 return ret;
291}
292
293int btrfs_acl_chmod(struct inode *inode)
294{
295 struct posix_acl *acl, *clone;
296 int ret = 0;
297
298 if (S_ISLNK(inode->i_mode))
299 return -EOPNOTSUPP;
300
301 if (!IS_POSIXACL(inode))
302 return 0;
303
304 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
305 if (IS_ERR(acl) || !acl)
306 return PTR_ERR(acl);
307
308 clone = posix_acl_clone(acl, GFP_KERNEL);
309 posix_acl_release(acl);
310 if (!clone)
311 return -ENOMEM;
312
313 ret = posix_acl_chmod_masq(clone, inode->i_mode);
314 if (!ret)
315 ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
316
317 posix_acl_release(clone);
318
319 return ret;
320}
321
322struct xattr_handler btrfs_xattr_acl_default_handler = {
323 .prefix = POSIX_ACL_XATTR_DEFAULT,
324 .get = btrfs_xattr_acl_default_get,
325 .set = btrfs_xattr_acl_default_set,
326};
327
328struct xattr_handler btrfs_xattr_acl_access_handler = {
329 .prefix = POSIX_ACL_XATTR_ACCESS,
330 .get = btrfs_xattr_acl_access_get,
331 .set = btrfs_xattr_acl_access_set,
332};
333
334#else /* CONFIG_FS_POSIX_ACL */
335
336int btrfs_acl_chmod(struct inode *inode)
337{
338 return 0;
339}
340
341int btrfs_init_acl(struct inode *inode, struct inode *dir)
342{
343 return 0;
344}
345
346int btrfs_check_acl(struct inode *inode, int mask)
347{
348 return 0;
349}
350
351#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..8e2fec05dbe0
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,419 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/kthread.h>
21#include <linux/list.h>
22#include <linux/spinlock.h>
23# include <linux/freezer.h>
24#include "async-thread.h"
25
26#define WORK_QUEUED_BIT 0
27#define WORK_DONE_BIT 1
28#define WORK_ORDER_DONE_BIT 2
29
30/*
31 * container for the kthread task pointer and the list of pending work
32 * One of these is allocated per thread.
33 */
34struct btrfs_worker_thread {
35 /* pool we belong to */
36 struct btrfs_workers *workers;
37
38 /* list of struct btrfs_work that are waiting for service */
39 struct list_head pending;
40
41 /* list of worker threads from struct btrfs_workers */
42 struct list_head worker_list;
43
44 /* kthread */
45 struct task_struct *task;
46
47 /* number of things on the pending list */
48 atomic_t num_pending;
49
50 unsigned long sequence;
51
52 /* protects the pending list. */
53 spinlock_t lock;
54
55 /* set to non-zero when this thread is already awake and kicking */
56 int working;
57
58 /* are we currently idle */
59 int idle;
60};
61
62/*
63 * helper function to move a thread onto the idle list after it
64 * has finished some requests.
65 */
66static void check_idle_worker(struct btrfs_worker_thread *worker)
67{
68 if (!worker->idle && atomic_read(&worker->num_pending) <
69 worker->workers->idle_thresh / 2) {
70 unsigned long flags;
71 spin_lock_irqsave(&worker->workers->lock, flags);
72 worker->idle = 1;
73 list_move(&worker->worker_list, &worker->workers->idle_list);
74 spin_unlock_irqrestore(&worker->workers->lock, flags);
75 }
76}
77
78/*
79 * helper function to move a thread off the idle list after new
80 * pending work is added.
81 */
82static void check_busy_worker(struct btrfs_worker_thread *worker)
83{
84 if (worker->idle && atomic_read(&worker->num_pending) >=
85 worker->workers->idle_thresh) {
86 unsigned long flags;
87 spin_lock_irqsave(&worker->workers->lock, flags);
88 worker->idle = 0;
89 list_move_tail(&worker->worker_list,
90 &worker->workers->worker_list);
91 spin_unlock_irqrestore(&worker->workers->lock, flags);
92 }
93}
94
95static noinline int run_ordered_completions(struct btrfs_workers *workers,
96 struct btrfs_work *work)
97{
98 unsigned long flags;
99
100 if (!workers->ordered)
101 return 0;
102
103 set_bit(WORK_DONE_BIT, &work->flags);
104
105 spin_lock_irqsave(&workers->lock, flags);
106
107 while (!list_empty(&workers->order_list)) {
108 work = list_entry(workers->order_list.next,
109 struct btrfs_work, order_list);
110
111 if (!test_bit(WORK_DONE_BIT, &work->flags))
112 break;
113
114 /* we are going to call the ordered done function, but
115 * we leave the work item on the list as a barrier so
116 * that later work items that are done don't have their
117 * functions called before this one returns
118 */
119 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
120 break;
121
122 spin_unlock_irqrestore(&workers->lock, flags);
123
124 work->ordered_func(work);
125
126 /* now take the lock again and call the freeing code */
127 spin_lock_irqsave(&workers->lock, flags);
128 list_del(&work->order_list);
129 work->ordered_free(work);
130 }
131
132 spin_unlock_irqrestore(&workers->lock, flags);
133 return 0;
134}
135
136/*
137 * main loop for servicing work items
138 */
139static int worker_loop(void *arg)
140{
141 struct btrfs_worker_thread *worker = arg;
142 struct list_head *cur;
143 struct btrfs_work *work;
144 do {
145 spin_lock_irq(&worker->lock);
146 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list);
149 list_del(&work->list);
150 clear_bit(WORK_QUEUED_BIT, &work->flags);
151
152 work->worker = worker;
153 spin_unlock_irq(&worker->lock);
154
155 work->func(work);
156
157 atomic_dec(&worker->num_pending);
158 /*
159 * unless this is an ordered work queue,
160 * 'work' was probably freed by func above.
161 */
162 run_ordered_completions(worker->workers, work);
163
164 spin_lock_irq(&worker->lock);
165 check_idle_worker(worker);
166
167 }
168 worker->working = 0;
169 if (freezing(current)) {
170 refrigerator();
171 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop())
175 schedule();
176 __set_current_state(TASK_RUNNING);
177 }
178 } while (!kthread_should_stop());
179 return 0;
180}
181
182/*
183 * this will wait for all the worker threads to shutdown
184 */
185int btrfs_stop_workers(struct btrfs_workers *workers)
186{
187 struct list_head *cur;
188 struct btrfs_worker_thread *worker;
189
190 list_splice_init(&workers->idle_list, &workers->worker_list);
191 while (!list_empty(&workers->worker_list)) {
192 cur = workers->worker_list.next;
193 worker = list_entry(cur, struct btrfs_worker_thread,
194 worker_list);
195 kthread_stop(worker->task);
196 list_del(&worker->worker_list);
197 kfree(worker);
198 }
199 return 0;
200}
201
202/*
203 * simple init on struct btrfs_workers
204 */
205void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
206{
207 workers->num_workers = 0;
208 INIT_LIST_HEAD(&workers->worker_list);
209 INIT_LIST_HEAD(&workers->idle_list);
210 INIT_LIST_HEAD(&workers->order_list);
211 spin_lock_init(&workers->lock);
212 workers->max_workers = max;
213 workers->idle_thresh = 32;
214 workers->name = name;
215 workers->ordered = 0;
216}
217
218/*
219 * starts new worker threads. This does not enforce the max worker
220 * count in case you need to temporarily go past it.
221 */
222int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
223{
224 struct btrfs_worker_thread *worker;
225 int ret = 0;
226 int i;
227
228 for (i = 0; i < num_workers; i++) {
229 worker = kzalloc(sizeof(*worker), GFP_NOFS);
230 if (!worker) {
231 ret = -ENOMEM;
232 goto fail;
233 }
234
235 INIT_LIST_HEAD(&worker->pending);
236 INIT_LIST_HEAD(&worker->worker_list);
237 spin_lock_init(&worker->lock);
238 atomic_set(&worker->num_pending, 0);
239 worker->task = kthread_run(worker_loop, worker,
240 "btrfs-%s-%d", workers->name,
241 workers->num_workers + i);
242 worker->workers = workers;
243 if (IS_ERR(worker->task)) {
244 kfree(worker);
245 ret = PTR_ERR(worker->task);
246 goto fail;
247 }
248
249 spin_lock_irq(&workers->lock);
250 list_add_tail(&worker->worker_list, &workers->idle_list);
251 worker->idle = 1;
252 workers->num_workers++;
253 spin_unlock_irq(&workers->lock);
254 }
255 return 0;
256fail:
257 btrfs_stop_workers(workers);
258 return ret;
259}
260
261/*
262 * run through the list and find a worker thread that doesn't have a lot
263 * to do right now. This can return null if we aren't yet at the thread
264 * count limit and all of the threads are busy.
265 */
266static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
267{
268 struct btrfs_worker_thread *worker;
269 struct list_head *next;
270 int enforce_min = workers->num_workers < workers->max_workers;
271
272 /*
273 * if we find an idle thread, don't move it to the end of the
274 * idle list. This improves the chance that the next submission
275 * will reuse the same thread, and maybe catch it while it is still
276 * working
277 */
278 if (!list_empty(&workers->idle_list)) {
279 next = workers->idle_list.next;
280 worker = list_entry(next, struct btrfs_worker_thread,
281 worker_list);
282 return worker;
283 }
284 if (enforce_min || list_empty(&workers->worker_list))
285 return NULL;
286
287 /*
288 * if we pick a busy task, move the task to the end of the list.
289 * hopefully this will keep things somewhat evenly balanced.
290 * Do the move in batches based on the sequence number. This groups
291 * requests submitted at roughly the same time onto the same worker.
292 */
293 next = workers->worker_list.next;
294 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
295 atomic_inc(&worker->num_pending);
296 worker->sequence++;
297
298 if (worker->sequence % workers->idle_thresh == 0)
299 list_move_tail(next, &workers->worker_list);
300 return worker;
301}
302
303/*
304 * selects a worker thread to take the next job. This will either find
305 * an idle worker, start a new worker up to the max count, or just return
306 * one of the existing busy workers.
307 */
308static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
309{
310 struct btrfs_worker_thread *worker;
311 unsigned long flags;
312
313again:
314 spin_lock_irqsave(&workers->lock, flags);
315 worker = next_worker(workers);
316 spin_unlock_irqrestore(&workers->lock, flags);
317
318 if (!worker) {
319 spin_lock_irqsave(&workers->lock, flags);
320 if (workers->num_workers >= workers->max_workers) {
321 struct list_head *fallback = NULL;
322 /*
323 * we have failed to find any workers, just
324 * return the force one
325 */
326 if (!list_empty(&workers->worker_list))
327 fallback = workers->worker_list.next;
328 if (!list_empty(&workers->idle_list))
329 fallback = workers->idle_list.next;
330 BUG_ON(!fallback);
331 worker = list_entry(fallback,
332 struct btrfs_worker_thread, worker_list);
333 spin_unlock_irqrestore(&workers->lock, flags);
334 } else {
335 spin_unlock_irqrestore(&workers->lock, flags);
336 /* we're below the limit, start another worker */
337 btrfs_start_workers(workers, 1);
338 goto again;
339 }
340 }
341 return worker;
342}
343
344/*
345 * btrfs_requeue_work just puts the work item back on the tail of the list
346 * it was taken from. It is intended for use with long running work functions
347 * that make some progress and want to give the cpu up for others.
348 */
349int btrfs_requeue_work(struct btrfs_work *work)
350{
351 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags;
353
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out;
356
357 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending);
360
361 /* by definition we're busy, take ourselves off the idle
362 * list
363 */
364 if (worker->idle) {
365 spin_lock_irqsave(&worker->workers->lock, flags);
366 worker->idle = 0;
367 list_move_tail(&worker->worker_list,
368 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 }
371
372 spin_unlock_irqrestore(&worker->lock, flags);
373
374out:
375 return 0;
376}
377
378/*
379 * places a struct btrfs_work into the pending queue of one of the kthreads
380 */
381int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
382{
383 struct btrfs_worker_thread *worker;
384 unsigned long flags;
385 int wake = 0;
386
387 /* don't requeue something already on a list */
388 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
389 goto out;
390
391 worker = find_worker(workers);
392 if (workers->ordered) {
393 spin_lock_irqsave(&workers->lock, flags);
394 list_add_tail(&work->order_list, &workers->order_list);
395 spin_unlock_irqrestore(&workers->lock, flags);
396 } else {
397 INIT_LIST_HEAD(&work->order_list);
398 }
399
400 spin_lock_irqsave(&worker->lock, flags);
401 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404
405 /*
406 * avoid calling into wake_up_process if this thread has already
407 * been kicked
408 */
409 if (!worker->working)
410 wake = 1;
411 worker->working = 1;
412
413 spin_unlock_irqrestore(&worker->lock, flags);
414
415 if (wake)
416 wake_up_process(worker->task);
417out:
418 return 0;
419}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..31be4ed8b63e
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_
21
22struct btrfs_worker_thread;
23
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work {
39 /*
40 * func should be set to the function you want called
41 * your work struct is passed as the only arg
42 *
43 * ordered_func must be set for work sent to an ordered work queue,
44 * and it is called to complete a given work item in the same
45 * order they were sent to the queue.
46 */
47 void (*func)(struct btrfs_work *work);
48 void (*ordered_func)(struct btrfs_work *work);
49 void (*ordered_free)(struct btrfs_work *work);
50
51 /*
52 * flags should be set to zero. It is used to make sure the
53 * struct is only inserted once into the list.
54 */
55 unsigned long flags;
56
57 /* don't touch these */
58 struct btrfs_worker_thread *worker;
59 struct list_head list;
60 struct list_head order_list;
61};
62
63struct btrfs_workers {
64 /* current number of running workers */
65 int num_workers;
66
67 /* max number of workers allowed. changed by btrfs_start_workers */
68 int max_workers;
69
70 /* once a worker has this many requests or fewer, it is idle */
71 int idle_thresh;
72
73 /* force completions in the order they were queued */
74 int ordered;
75
76 /* list with all the work threads. The workers on the idle thread
77 * may be actively servicing jobs, but they haven't yet hit the
78 * idle thresh limit above.
79 */
80 struct list_head worker_list;
81 struct list_head idle_list;
82
83 /*
84 * when operating in ordered mode, this maintains the list
85 * of work items waiting for completion
86 */
87 struct list_head order_list;
88
89 /* lock for finding the next worker thread to queue on */
90 spinlock_t lock;
91
92 /* extra name for this worker, used for current->name */
93 char *name;
94};
95
96int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
97int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
98int btrfs_stop_workers(struct btrfs_workers *workers);
99void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
100int btrfs_requeue_work(struct btrfs_work *work);
101#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..a8c9693b75ac
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,131 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_I__
20#define __BTRFS_I__
21
22#include "extent_map.h"
23#include "extent_io.h"
24#include "ordered-data.h"
25
26/* in memory btrfs inode */
27struct btrfs_inode {
28 /* which subvolume this inode belongs to */
29 struct btrfs_root *root;
30
31 /* key used to find this inode on disk. This is used by the code
32 * to read in roots of subvolumes
33 */
34 struct btrfs_key location;
35
36 /* the extent_tree has caches of all the extent mappings to disk */
37 struct extent_map_tree extent_tree;
38
39 /* the io_tree does range state (DIRTY, LOCKED etc) */
40 struct extent_io_tree io_tree;
41
42 /* special utility tree used to record which mirrors have already been
43 * tried when checksums fail for a given block
44 */
45 struct extent_io_tree io_failure_tree;
46
47 /* held while inesrting or deleting extents from files */
48 struct mutex extent_mutex;
49
50 /* held while logging the inode in tree-log.c */
51 struct mutex log_mutex;
52
53 /* used to order data wrt metadata */
54 struct btrfs_ordered_inode_tree ordered_tree;
55
56 /* standard acl pointers */
57 struct posix_acl *i_acl;
58 struct posix_acl *i_default_acl;
59
60 /* for keeping track of orphaned inodes */
61 struct list_head i_orphan;
62
63 /* list of all the delalloc inodes in the FS. There are times we need
64 * to write all the delalloc pages to disk, and this list is used
65 * to walk them all.
66 */
67 struct list_head delalloc_inodes;
68
69 /* full 64 bit generation number, struct vfs_inode doesn't have a big
70 * enough field for this.
71 */
72 u64 generation;
73
74 /* sequence number for NFS changes */
75 u64 sequence;
76
77 /*
78 * transid of the trans_handle that last modified this inode
79 */
80 u64 last_trans;
81 /*
82 * transid that last logged this inode
83 */
84 u64 logged_trans;
85
86 /*
87 * trans that last made a change that should be fully fsync'd. This
88 * gets reset to zero each time the inode is logged
89 */
90 u64 log_dirty_trans;
91
92 /* total number of bytes pending delalloc, used by stat to calc the
93 * real block usage of the file
94 */
95 u64 delalloc_bytes;
96
97 /*
98 * the size of the file stored in the metadata on disk. data=ordered
99 * means the in-memory i_size might be larger than the size on disk
100 * because not all the blocks are written yet.
101 */
102 u64 disk_i_size;
103
104 /* flags field from the on disk inode */
105 u32 flags;
106
107 /*
108 * if this is a directory then index_cnt is the counter for the index
109 * number for new files that are created
110 */
111 u64 index_cnt;
112
113 /* the start of block group preferred for allocations. */
114 u64 block_group;
115
116 struct inode vfs_inode;
117};
118
119static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
120{
121 return container_of(inode, struct btrfs_inode, vfs_inode);
122}
123
124static inline void btrfs_i_size_write(struct inode *inode, u64 size)
125{
126 inode->i_size = size;
127 BTRFS_I(inode)->disk_i_size = size;
128}
129
130
131#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..7c4503ef6efd
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,7 @@
1#ifndef _COMPAT_H_
2#define _COMPAT_H_
3
4#define btrfs_drop_nlink(inode) drop_nlink(inode)
5#define btrfs_inc_nlink(inode) inc_nlink(inode)
6
7#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..ee848d8585d9
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,709 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h>
37#include "compat.h"
38#include "ctree.h"
39#include "disk-io.h"
40#include "transaction.h"
41#include "btrfs_inode.h"
42#include "volumes.h"
43#include "ordered-data.h"
44#include "compression.h"
45#include "extent_io.h"
46#include "extent_map.h"
47
48struct compressed_bio {
49 /* number of bios pending for this compressed extent */
50 atomic_t pending_bios;
51
52 /* the pages with the compressed data on them */
53 struct page **compressed_pages;
54
55 /* inode that owns this data */
56 struct inode *inode;
57
58 /* starting offset in the inode for our pages */
59 u64 start;
60
61 /* number of bytes in the inode we're working on */
62 unsigned long len;
63
64 /* number of bytes on disk */
65 unsigned long compressed_len;
66
67 /* number of compressed pages in the array */
68 unsigned long nr_pages;
69
70 /* IO errors */
71 int errors;
72 int mirror_num;
73
74 /* for reads, this is the bio we are copying the data into */
75 struct bio *orig_bio;
76
77 /*
78 * the start of a variable length array of checksums only
79 * used by reads
80 */
81 u32 sums;
82};
83
84static inline int compressed_bio_size(struct btrfs_root *root,
85 unsigned long disk_size)
86{
87 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
88 return sizeof(struct compressed_bio) +
89 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
90 csum_size;
91}
92
93static struct bio *compressed_bio_alloc(struct block_device *bdev,
94 u64 first_byte, gfp_t gfp_flags)
95{
96 struct bio *bio;
97 int nr_vecs;
98
99 nr_vecs = bio_get_nr_vecs(bdev);
100 bio = bio_alloc(gfp_flags, nr_vecs);
101
102 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
103 while (!bio && (nr_vecs /= 2))
104 bio = bio_alloc(gfp_flags, nr_vecs);
105 }
106
107 if (bio) {
108 bio->bi_size = 0;
109 bio->bi_bdev = bdev;
110 bio->bi_sector = first_byte >> 9;
111 }
112 return bio;
113}
114
115static int check_compressed_csum(struct inode *inode,
116 struct compressed_bio *cb,
117 u64 disk_start)
118{
119 int ret;
120 struct btrfs_root *root = BTRFS_I(inode)->root;
121 struct page *page;
122 unsigned long i;
123 char *kaddr;
124 u32 csum;
125 u32 *cb_sum = &cb->sums;
126
127 if (btrfs_test_flag(inode, NODATASUM))
128 return 0;
129
130 for (i = 0; i < cb->nr_pages; i++) {
131 page = cb->compressed_pages[i];
132 csum = ~(u32)0;
133
134 kaddr = kmap_atomic(page, KM_USER0);
135 csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
136 btrfs_csum_final(csum, (char *)&csum);
137 kunmap_atomic(kaddr, KM_USER0);
138
139 if (csum != *cb_sum) {
140 printk(KERN_INFO "btrfs csum failed ino %lu "
141 "extent %llu csum %u "
142 "wanted %u mirror %d\n", inode->i_ino,
143 (unsigned long long)disk_start,
144 csum, *cb_sum, cb->mirror_num);
145 ret = -EIO;
146 goto fail;
147 }
148 cb_sum++;
149
150 }
151 ret = 0;
152fail:
153 return ret;
154}
155
156/* when we finish reading compressed pages from the disk, we
157 * decompress them and then run the bio end_io routines on the
158 * decompressed pages (in the inode address space).
159 *
160 * This allows the checksumming and other IO error handling routines
161 * to work normally
162 *
163 * The compressed pages are freed here, and it must be run
164 * in process context
165 */
166static void end_compressed_bio_read(struct bio *bio, int err)
167{
168 struct extent_io_tree *tree;
169 struct compressed_bio *cb = bio->bi_private;
170 struct inode *inode;
171 struct page *page;
172 unsigned long index;
173 int ret;
174
175 if (err)
176 cb->errors = 1;
177
178 /* if there are more bios still pending for this compressed
179 * extent, just exit
180 */
181 if (!atomic_dec_and_test(&cb->pending_bios))
182 goto out;
183
184 inode = cb->inode;
185 ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
186 if (ret)
187 goto csum_failed;
188
189 /* ok, we're the last bio for this extent, lets start
190 * the decompression.
191 */
192 tree = &BTRFS_I(inode)->io_tree;
193 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
194 cb->start,
195 cb->orig_bio->bi_io_vec,
196 cb->orig_bio->bi_vcnt,
197 cb->compressed_len);
198csum_failed:
199 if (ret)
200 cb->errors = 1;
201
202 /* release the compressed pages */
203 index = 0;
204 for (index = 0; index < cb->nr_pages; index++) {
205 page = cb->compressed_pages[index];
206 page->mapping = NULL;
207 page_cache_release(page);
208 }
209
210 /* do io completion on the original bio */
211 if (cb->errors) {
212 bio_io_error(cb->orig_bio);
213 } else {
214 int bio_index = 0;
215 struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
216
217 /*
218 * we have verified the checksum already, set page
219 * checked so the end_io handlers know about it
220 */
221 while (bio_index < cb->orig_bio->bi_vcnt) {
222 SetPageChecked(bvec->bv_page);
223 bvec++;
224 bio_index++;
225 }
226 bio_endio(cb->orig_bio, 0);
227 }
228
229 /* finally free the cb struct */
230 kfree(cb->compressed_pages);
231 kfree(cb);
232out:
233 bio_put(bio);
234}
235
236/*
237 * Clear the writeback bits on all of the file
238 * pages for a compressed write
239 */
240static noinline int end_compressed_writeback(struct inode *inode, u64 start,
241 unsigned long ram_size)
242{
243 unsigned long index = start >> PAGE_CACHE_SHIFT;
244 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
245 struct page *pages[16];
246 unsigned long nr_pages = end_index - index + 1;
247 int i;
248 int ret;
249
250 while (nr_pages > 0) {
251 ret = find_get_pages_contig(inode->i_mapping, index,
252 min_t(unsigned long,
253 nr_pages, ARRAY_SIZE(pages)), pages);
254 if (ret == 0) {
255 nr_pages -= 1;
256 index += 1;
257 continue;
258 }
259 for (i = 0; i < ret; i++) {
260 end_page_writeback(pages[i]);
261 page_cache_release(pages[i]);
262 }
263 nr_pages -= ret;
264 index += ret;
265 }
266 /* the inode may be gone now */
267 return 0;
268}
269
270/*
271 * do the cleanup once all the compressed pages hit the disk.
272 * This will clear writeback on the file pages and free the compressed
273 * pages.
274 *
275 * This also calls the writeback end hooks for the file pages so that
276 * metadata and checksums can be updated in the file.
277 */
278static void end_compressed_bio_write(struct bio *bio, int err)
279{
280 struct extent_io_tree *tree;
281 struct compressed_bio *cb = bio->bi_private;
282 struct inode *inode;
283 struct page *page;
284 unsigned long index;
285
286 if (err)
287 cb->errors = 1;
288
289 /* if there are more bios still pending for this compressed
290 * extent, just exit
291 */
292 if (!atomic_dec_and_test(&cb->pending_bios))
293 goto out;
294
295 /* ok, we're the last bio for this extent, step one is to
296 * call back into the FS and do all the end_io operations
297 */
298 inode = cb->inode;
299 tree = &BTRFS_I(inode)->io_tree;
300 cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
301 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
302 cb->start,
303 cb->start + cb->len - 1,
304 NULL, 1);
305 cb->compressed_pages[0]->mapping = NULL;
306
307 end_compressed_writeback(inode, cb->start, cb->len);
308 /* note, our inode could be gone now */
309
310 /*
311 * release the compressed pages, these came from alloc_page and
312 * are not attached to the inode at all
313 */
314 index = 0;
315 for (index = 0; index < cb->nr_pages; index++) {
316 page = cb->compressed_pages[index];
317 page->mapping = NULL;
318 page_cache_release(page);
319 }
320
321 /* finally free the cb struct */
322 kfree(cb->compressed_pages);
323 kfree(cb);
324out:
325 bio_put(bio);
326}
327
328/*
329 * worker function to build and submit bios for previously compressed pages.
330 * The corresponding pages in the inode should be marked for writeback
331 * and the compressed pages should have a reference on them for dropping
332 * when the IO is complete.
333 *
334 * This also checksums the file bytes and gets things ready for
335 * the end io hooks.
336 */
337int btrfs_submit_compressed_write(struct inode *inode, u64 start,
338 unsigned long len, u64 disk_start,
339 unsigned long compressed_len,
340 struct page **compressed_pages,
341 unsigned long nr_pages)
342{
343 struct bio *bio = NULL;
344 struct btrfs_root *root = BTRFS_I(inode)->root;
345 struct compressed_bio *cb;
346 unsigned long bytes_left;
347 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
348 int page_index = 0;
349 struct page *page;
350 u64 first_byte = disk_start;
351 struct block_device *bdev;
352 int ret;
353
354 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
355 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
356 atomic_set(&cb->pending_bios, 0);
357 cb->errors = 0;
358 cb->inode = inode;
359 cb->start = start;
360 cb->len = len;
361 cb->mirror_num = 0;
362 cb->compressed_pages = compressed_pages;
363 cb->compressed_len = compressed_len;
364 cb->orig_bio = NULL;
365 cb->nr_pages = nr_pages;
366
367 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
368
369 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
370 bio->bi_private = cb;
371 bio->bi_end_io = end_compressed_bio_write;
372 atomic_inc(&cb->pending_bios);
373
374 /* create and submit bios for the compressed pages */
375 bytes_left = compressed_len;
376 for (page_index = 0; page_index < cb->nr_pages; page_index++) {
377 page = compressed_pages[page_index];
378 page->mapping = inode->i_mapping;
379 if (bio->bi_size)
380 ret = io_tree->ops->merge_bio_hook(page, 0,
381 PAGE_CACHE_SIZE,
382 bio, 0);
383 else
384 ret = 0;
385
386 page->mapping = NULL;
387 if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
388 PAGE_CACHE_SIZE) {
389 bio_get(bio);
390
391 /*
392 * inc the count before we submit the bio so
393 * we know the end IO handler won't happen before
394 * we inc the count. Otherwise, the cb might get
395 * freed before we're done setting it up
396 */
397 atomic_inc(&cb->pending_bios);
398 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
399 BUG_ON(ret);
400
401 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
402 BUG_ON(ret);
403
404 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
405 BUG_ON(ret);
406
407 bio_put(bio);
408
409 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
410 bio->bi_private = cb;
411 bio->bi_end_io = end_compressed_bio_write;
412 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
413 }
414 if (bytes_left < PAGE_CACHE_SIZE) {
415 printk("bytes left %lu compress len %lu nr %lu\n",
416 bytes_left, cb->compressed_len, cb->nr_pages);
417 }
418 bytes_left -= PAGE_CACHE_SIZE;
419 first_byte += PAGE_CACHE_SIZE;
420 cond_resched();
421 }
422 bio_get(bio);
423
424 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
425 BUG_ON(ret);
426
427 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
428 BUG_ON(ret);
429
430 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
431 BUG_ON(ret);
432
433 bio_put(bio);
434 return 0;
435}
436
437static noinline int add_ra_bio_pages(struct inode *inode,
438 u64 compressed_end,
439 struct compressed_bio *cb)
440{
441 unsigned long end_index;
442 unsigned long page_index;
443 u64 last_offset;
444 u64 isize = i_size_read(inode);
445 int ret;
446 struct page *page;
447 unsigned long nr_pages = 0;
448 struct extent_map *em;
449 struct address_space *mapping = inode->i_mapping;
450 struct pagevec pvec;
451 struct extent_map_tree *em_tree;
452 struct extent_io_tree *tree;
453 u64 end;
454 int misses = 0;
455
456 page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
457 last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
458 em_tree = &BTRFS_I(inode)->extent_tree;
459 tree = &BTRFS_I(inode)->io_tree;
460
461 if (isize == 0)
462 return 0;
463
464 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
465
466 pagevec_init(&pvec, 0);
467 while (last_offset < compressed_end) {
468 page_index = last_offset >> PAGE_CACHE_SHIFT;
469
470 if (page_index > end_index)
471 break;
472
473 rcu_read_lock();
474 page = radix_tree_lookup(&mapping->page_tree, page_index);
475 rcu_read_unlock();
476 if (page) {
477 misses++;
478 if (misses > 4)
479 break;
480 goto next;
481 }
482
483 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
484 if (!page)
485 break;
486
487 page->index = page_index;
488 /*
489 * what we want to do here is call add_to_page_cache_lru,
490 * but that isn't exported, so we reproduce it here
491 */
492 if (add_to_page_cache(page, mapping,
493 page->index, GFP_NOFS)) {
494 page_cache_release(page);
495 goto next;
496 }
497
498 /* open coding of lru_cache_add, also not exported */
499 page_cache_get(page);
500 if (!pagevec_add(&pvec, page))
501 __pagevec_lru_add_file(&pvec);
502
503 end = last_offset + PAGE_CACHE_SIZE - 1;
504 /*
505 * at this point, we have a locked page in the page cache
506 * for these bytes in the file. But, we have to make
507 * sure they map to this compressed extent on disk.
508 */
509 set_page_extent_mapped(page);
510 lock_extent(tree, last_offset, end, GFP_NOFS);
511 spin_lock(&em_tree->lock);
512 em = lookup_extent_mapping(em_tree, last_offset,
513 PAGE_CACHE_SIZE);
514 spin_unlock(&em_tree->lock);
515
516 if (!em || last_offset < em->start ||
517 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
518 (em->block_start >> 9) != cb->orig_bio->bi_sector) {
519 free_extent_map(em);
520 unlock_extent(tree, last_offset, end, GFP_NOFS);
521 unlock_page(page);
522 page_cache_release(page);
523 break;
524 }
525 free_extent_map(em);
526
527 if (page->index == end_index) {
528 char *userpage;
529 size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
530
531 if (zero_offset) {
532 int zeros;
533 zeros = PAGE_CACHE_SIZE - zero_offset;
534 userpage = kmap_atomic(page, KM_USER0);
535 memset(userpage + zero_offset, 0, zeros);
536 flush_dcache_page(page);
537 kunmap_atomic(userpage, KM_USER0);
538 }
539 }
540
541 ret = bio_add_page(cb->orig_bio, page,
542 PAGE_CACHE_SIZE, 0);
543
544 if (ret == PAGE_CACHE_SIZE) {
545 nr_pages++;
546 page_cache_release(page);
547 } else {
548 unlock_extent(tree, last_offset, end, GFP_NOFS);
549 unlock_page(page);
550 page_cache_release(page);
551 break;
552 }
553next:
554 last_offset += PAGE_CACHE_SIZE;
555 }
556 if (pagevec_count(&pvec))
557 __pagevec_lru_add_file(&pvec);
558 return 0;
559}
560
561/*
562 * for a compressed read, the bio we get passed has all the inode pages
563 * in it. We don't actually do IO on those pages but allocate new ones
564 * to hold the compressed pages on disk.
565 *
566 * bio->bi_sector points to the compressed extent on disk
567 * bio->bi_io_vec points to all of the inode pages
568 * bio->bi_vcnt is a count of pages
569 *
570 * After the compressed pages are read, we copy the bytes into the
571 * bio we were passed and then call the bio end_io calls
572 */
573int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
574 int mirror_num, unsigned long bio_flags)
575{
576 struct extent_io_tree *tree;
577 struct extent_map_tree *em_tree;
578 struct compressed_bio *cb;
579 struct btrfs_root *root = BTRFS_I(inode)->root;
580 unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
581 unsigned long compressed_len;
582 unsigned long nr_pages;
583 unsigned long page_index;
584 struct page *page;
585 struct block_device *bdev;
586 struct bio *comp_bio;
587 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
588 u64 em_len;
589 u64 em_start;
590 struct extent_map *em;
591 int ret;
592 u32 *sums;
593
594 tree = &BTRFS_I(inode)->io_tree;
595 em_tree = &BTRFS_I(inode)->extent_tree;
596
597 /* we need the actual starting offset of this extent in the file */
598 spin_lock(&em_tree->lock);
599 em = lookup_extent_mapping(em_tree,
600 page_offset(bio->bi_io_vec->bv_page),
601 PAGE_CACHE_SIZE);
602 spin_unlock(&em_tree->lock);
603
604 compressed_len = em->block_len;
605 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
606 atomic_set(&cb->pending_bios, 0);
607 cb->errors = 0;
608 cb->inode = inode;
609 cb->mirror_num = mirror_num;
610 sums = &cb->sums;
611
612 cb->start = em->orig_start;
613 em_len = em->len;
614 em_start = em->start;
615
616 free_extent_map(em);
617 em = NULL;
618
619 cb->len = uncompressed_len;
620 cb->compressed_len = compressed_len;
621 cb->orig_bio = bio;
622
623 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
624 PAGE_CACHE_SIZE;
625 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
626 GFP_NOFS);
627 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
628
629 for (page_index = 0; page_index < nr_pages; page_index++) {
630 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
631 __GFP_HIGHMEM);
632 }
633 cb->nr_pages = nr_pages;
634
635 add_ra_bio_pages(inode, em_start + em_len, cb);
636
637 /* include any pages we added in add_ra-bio_pages */
638 uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
639 cb->len = uncompressed_len;
640
641 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
642 comp_bio->bi_private = cb;
643 comp_bio->bi_end_io = end_compressed_bio_read;
644 atomic_inc(&cb->pending_bios);
645
646 for (page_index = 0; page_index < nr_pages; page_index++) {
647 page = cb->compressed_pages[page_index];
648 page->mapping = inode->i_mapping;
649 page->index = em_start >> PAGE_CACHE_SHIFT;
650
651 if (comp_bio->bi_size)
652 ret = tree->ops->merge_bio_hook(page, 0,
653 PAGE_CACHE_SIZE,
654 comp_bio, 0);
655 else
656 ret = 0;
657
658 page->mapping = NULL;
659 if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
660 PAGE_CACHE_SIZE) {
661 bio_get(comp_bio);
662
663 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
664 BUG_ON(ret);
665
666 /*
667 * inc the count before we submit the bio so
668 * we know the end IO handler won't happen before
669 * we inc the count. Otherwise, the cb might get
670 * freed before we're done setting it up
671 */
672 atomic_inc(&cb->pending_bios);
673
674 if (!btrfs_test_flag(inode, NODATASUM)) {
675 btrfs_lookup_bio_sums(root, inode, comp_bio,
676 sums);
677 }
678 sums += (comp_bio->bi_size + root->sectorsize - 1) /
679 root->sectorsize;
680
681 ret = btrfs_map_bio(root, READ, comp_bio,
682 mirror_num, 0);
683 BUG_ON(ret);
684
685 bio_put(comp_bio);
686
687 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
688 GFP_NOFS);
689 comp_bio->bi_private = cb;
690 comp_bio->bi_end_io = end_compressed_bio_read;
691
692 bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
693 }
694 cur_disk_byte += PAGE_CACHE_SIZE;
695 }
696 bio_get(comp_bio);
697
698 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
699 BUG_ON(ret);
700
701 if (!btrfs_test_flag(inode, NODATASUM))
702 btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
703
704 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
705 BUG_ON(ret);
706
707 bio_put(comp_bio);
708 return 0;
709}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_
21
22int btrfs_zlib_decompress(unsigned char *data_in,
23 struct page *dest_page,
24 unsigned long start_byte,
25 size_t srclen, size_t destlen);
26int btrfs_zlib_compress_pages(struct address_space *mapping,
27 u64 start, unsigned long len,
28 struct page **pages,
29 unsigned long nr_dest_pages,
30 unsigned long *out_pages,
31 unsigned long *total_in,
32 unsigned long *total_out,
33 unsigned long max_out);
34int btrfs_zlib_decompress_biovec(struct page **pages_in,
35 u64 disk_start,
36 struct bio_vec *bvec,
37 int vcnt,
38 size_t srclen);
39void btrfs_zlib_exit(void);
40int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start,
42 unsigned long compressed_len,
43 struct page **compressed_pages,
44 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags);
47#endif
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..6e1b3de36700
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <linux/crc32c.h>
22
23/*
24 * this file used to do more for selecting the HW version of crc32c,
25 * perhaps it will one day again soon.
26 */
27#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
28#endif
29
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..9e46c0776816
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3953 @@
1/*
2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "print-tree.h"
24#include "locking.h"
25
/* forward declarations for static helpers used before they are defined */
static int split_node(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct btrfs_key *ins_key,
		      struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static int del_ptr(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);
40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void)
47{
48 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) {
51 btrfs_init_path(path);
52 path->reada = 1;
53 }
54 return path;
55}
56
57/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p)
59{
60 btrfs_release_path(NULL, p);
61 kmem_cache_free(btrfs_path_cachep, p);
62}
63
64/*
65 * path release drops references on the extent buffers in the path
66 * and it drops any locks held by this path
67 *
68 * It is safe to call this on paths that no locks or extent buffers held.
69 */
70noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71{
72 int i;
73
74 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75 p->slots[i] = 0;
76 if (!p->nodes[i])
77 continue;
78 if (p->locks[i]) {
79 btrfs_tree_unlock(p->nodes[i]);
80 p->locks[i] = 0;
81 }
82 free_extent_buffer(p->nodes[i]);
83 p->nodes[i] = NULL;
84 }
85}
86
87/*
88 * safely gets a reference on the root node of a tree. A lock
89 * is not taken, so a concurrent writer may put a different node
90 * at the root of the tree. See btrfs_lock_root_node for the
91 * looping required.
92 *
93 * The extent buffer returned by this has a reference taken, so
94 * it won't disappear. It may stop being the root of the tree
95 * at any time because there are no locks held.
96 */
97struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98{
99 struct extent_buffer *eb;
100 spin_lock(&root->node_lock);
101 eb = root->node;
102 extent_buffer_get(eb);
103 spin_unlock(&root->node_lock);
104 return eb;
105}
106
107/* loop around taking references on and locking the root node of the
108 * tree until you end up with a lock on the root. A locked buffer
109 * is returned, with a reference held.
110 */
111struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112{
113 struct extent_buffer *eb;
114
115 while (1) {
116 eb = btrfs_root_node(root);
117 btrfs_tree_lock(eb);
118
119 spin_lock(&root->node_lock);
120 if (eb == root->node) {
121 spin_unlock(&root->node_lock);
122 break;
123 }
124 spin_unlock(&root->node_lock);
125
126 btrfs_tree_unlock(eb);
127 free_extent_buffer(eb);
128 }
129 return eb;
130}
131
132/* cowonly root (everything not a reference counted cow subvolume), just get
133 * put onto a simple dirty list. transaction.c walks this to make sure they
134 * get properly updated on disk.
135 */
136static void add_root_to_dirty_list(struct btrfs_root *root)
137{
138 if (root->track_dirty && list_empty(&root->dirty_list)) {
139 list_add(&root->dirty_list,
140 &root->fs_info->dirty_cowonly_roots);
141 }
142}
143
144/*
145 * used by snapshot creation to make a copy of a root for a tree with
146 * a given objectid. The buffer with the new root node is returned in
147 * cow_ret, and this func returns zero on success or a negative error code.
148 */
149int btrfs_copy_root(struct btrfs_trans_handle *trans,
150 struct btrfs_root *root,
151 struct extent_buffer *buf,
152 struct extent_buffer **cow_ret, u64 new_root_objectid)
153{
154 struct extent_buffer *cow;
155 u32 nritems;
156 int ret = 0;
157 int level;
158 struct btrfs_root *new_root;
159
160 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
161 if (!new_root)
162 return -ENOMEM;
163
164 memcpy(new_root, root, sizeof(*new_root));
165 new_root->root_key.objectid = new_root_objectid;
166
167 WARN_ON(root->ref_cows && trans->transid !=
168 root->fs_info->running_transaction->transid);
169 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
170
171 level = btrfs_header_level(buf);
172 nritems = btrfs_header_nritems(buf);
173
174 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
175 new_root_objectid, trans->transid,
176 level, buf->start, 0);
177 if (IS_ERR(cow)) {
178 kfree(new_root);
179 return PTR_ERR(cow);
180 }
181
182 copy_extent_buffer(cow, buf, 0, 0, cow->len);
183 btrfs_set_header_bytenr(cow, cow->start);
184 btrfs_set_header_generation(cow, trans->transid);
185 btrfs_set_header_owner(cow, new_root_objectid);
186 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187
188 write_extent_buffer(cow, root->fs_info->fsid,
189 (unsigned long)btrfs_header_fsid(cow),
190 BTRFS_FSID_SIZE);
191
192 WARN_ON(btrfs_header_generation(buf) > trans->transid);
193 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194 kfree(new_root);
195
196 if (ret)
197 return ret;
198
199 btrfs_mark_buffer_dirty(cow);
200 *cow_ret = cow;
201 return 0;
202}
203
204/*
205 * does the dirty work in cow of a single block. The parent block (if
206 * supplied) is updated to point to the new cow copy. The new buffer is marked
207 * dirty and returned locked. If you modify the block it needs to be marked
208 * dirty again.
209 *
210 * search_start -- an allocation hint for the new block
211 *
212 * empty_size -- a hint that you plan on doing more cow. This is the size in
213 * bytes the allocator should try to find free next to the block it returns.
214 * This is just a hint and may be ignored by the allocator.
215 *
216 * prealloc_dest -- if you have already reserved a destination for the cow,
217 * this uses that block instead of allocating a new one.
218 * btrfs_alloc_reserved_extent is used to finish the allocation.
219 */
220static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 struct btrfs_root *root,
222 struct extent_buffer *buf,
223 struct extent_buffer *parent, int parent_slot,
224 struct extent_buffer **cow_ret,
225 u64 search_start, u64 empty_size,
226 u64 prealloc_dest)
227{
228 u64 parent_start;
229 struct extent_buffer *cow;
230 u32 nritems;
231 int ret = 0;
232 int level;
233 int unlock_orig = 0;
234
235 if (*cow_ret == buf)
236 unlock_orig = 1;
237
238 WARN_ON(!btrfs_tree_locked(buf));
239
240 if (parent)
241 parent_start = parent->start;
242 else
243 parent_start = 0;
244
245 WARN_ON(root->ref_cows && trans->transid !=
246 root->fs_info->running_transaction->transid);
247 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248
249 level = btrfs_header_level(buf);
250 nritems = btrfs_header_nritems(buf);
251
252 if (prealloc_dest) {
253 struct btrfs_key ins;
254
255 ins.objectid = prealloc_dest;
256 ins.offset = buf->len;
257 ins.type = BTRFS_EXTENT_ITEM_KEY;
258
259 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260 root->root_key.objectid,
261 trans->transid, level, &ins);
262 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len);
265 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start,
268 root->root_key.objectid,
269 trans->transid, level,
270 search_start, empty_size);
271 }
272 if (IS_ERR(cow))
273 return PTR_ERR(cow);
274
275 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid);
278 btrfs_set_header_owner(cow, root->root_key.objectid);
279 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280
281 write_extent_buffer(cow, root->fs_info->fsid,
282 (unsigned long)btrfs_header_fsid(cow),
283 BTRFS_FSID_SIZE);
284
285 WARN_ON(btrfs_header_generation(buf) > trans->transid);
286 if (btrfs_header_generation(buf) != trans->transid) {
287 u32 nr_extents;
288 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 if (ret)
290 return ret;
291
292 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
293 WARN_ON(ret);
294 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
295 /*
296 * There are only two places that can drop reference to
297 * tree blocks owned by living reloc trees, one is here,
298 * the other place is btrfs_drop_subtree. In both places,
299 * we check reference count while tree block is locked.
300 * Furthermore, if reference count is one, it won't get
301 * increased by someone else.
302 */
303 u32 refs;
304 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
305 buf->len, &refs);
306 BUG_ON(ret);
307 if (refs == 1) {
308 ret = btrfs_update_ref(trans, root, buf, cow,
309 0, nritems);
310 clean_tree_block(trans, root, buf);
311 } else {
312 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
313 }
314 BUG_ON(ret);
315 } else {
316 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
317 if (ret)
318 return ret;
319 clean_tree_block(trans, root, buf);
320 }
321
322 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
323 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
324 WARN_ON(ret);
325 }
326
327 if (buf == root->node) {
328 WARN_ON(parent && parent != buf);
329
330 spin_lock(&root->node_lock);
331 root->node = cow;
332 extent_buffer_get(cow);
333 spin_unlock(&root->node_lock);
334
335 if (buf != root->commit_root) {
336 btrfs_free_extent(trans, root, buf->start,
337 buf->len, buf->start,
338 root->root_key.objectid,
339 btrfs_header_generation(buf),
340 level, 1);
341 }
342 free_extent_buffer(buf);
343 add_root_to_dirty_list(root);
344 } else {
345 btrfs_set_node_blockptr(parent, parent_slot,
346 cow->start);
347 WARN_ON(trans->transid == 0);
348 btrfs_set_node_ptr_generation(parent, parent_slot,
349 trans->transid);
350 btrfs_mark_buffer_dirty(parent);
351 WARN_ON(btrfs_header_generation(parent) != trans->transid);
352 btrfs_free_extent(trans, root, buf->start, buf->len,
353 parent_start, btrfs_header_owner(parent),
354 btrfs_header_generation(parent), level, 1);
355 }
356 if (unlock_orig)
357 btrfs_tree_unlock(buf);
358 free_extent_buffer(buf);
359 btrfs_mark_buffer_dirty(cow);
360 *cow_ret = cow;
361 return 0;
362}
363
364/*
365 * cows a single block, see __btrfs_cow_block for the real work.
366 * This version of it has extra checks so that a block isn't cow'd more than
367 * once per transaction, as long as it hasn't been written yet
368 */
369noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
370 struct btrfs_root *root, struct extent_buffer *buf,
371 struct extent_buffer *parent, int parent_slot,
372 struct extent_buffer **cow_ret, u64 prealloc_dest)
373{
374 u64 search_start;
375 int ret;
376
377 if (trans->transaction != root->fs_info->running_transaction) {
378 printk(KERN_CRIT "trans %llu running %llu\n",
379 (unsigned long long)trans->transid,
380 (unsigned long long)
381 root->fs_info->running_transaction->transid);
382 WARN_ON(1);
383 }
384 if (trans->transid != root->fs_info->generation) {
385 printk(KERN_CRIT "trans %llu running %llu\n",
386 (unsigned long long)trans->transid,
387 (unsigned long long)root->fs_info->generation);
388 WARN_ON(1);
389 }
390
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest);
398 return 0;
399 }
400 spin_unlock(&root->fs_info->hash_lock);
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
402 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest);
405 return ret;
406}
407
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by.  Returns 1 when the gap between the end of
 * the lower block and the start of the higher one is under 32768 bytes,
 * 0 otherwise (including when both block numbers are equal).
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	u64 gap;

	if (blocknr == other)
		return 0;

	if (blocknr < other)
		gap = other - (blocknr + blocksize);
	else
		gap = blocknr - (other + blocksize);

	return gap < 32768;
}
420
421/*
422 * compare two keys in a memcmp fashion
423 */
424static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
425{
426 struct btrfs_key k1;
427
428 btrfs_disk_key_to_cpu(&k1, disk);
429
430 if (k1.objectid > k2->objectid)
431 return 1;
432 if (k1.objectid < k2->objectid)
433 return -1;
434 if (k1.type > k2->type)
435 return 1;
436 if (k1.type < k2->type)
437 return -1;
438 if (k1.offset > k2->offset)
439 return 1;
440 if (k1.offset < k2->offset)
441 return -1;
442 return 0;
443}
444
445/*
446 * same as comp_keys only with two btrfs_key's
447 */
448static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
449{
450 if (k1->objectid > k2->objectid)
451 return 1;
452 if (k1->objectid < k2->objectid)
453 return -1;
454 if (k1->type > k2->type)
455 return 1;
456 if (k1->type < k2->type)
457 return -1;
458 if (k1->offset > k2->offset)
459 return 1;
460 if (k1->offset < k2->offset)
461 return -1;
462 return 0;
463}
464
465/*
466 * this is used by the defrag code to go through all the
467 * leaves pointed to by a node and reallocate them so that
468 * disk order is close to key order
469 */
470int btrfs_realloc_node(struct btrfs_trans_handle *trans,
471 struct btrfs_root *root, struct extent_buffer *parent,
472 int start_slot, int cache_only, u64 *last_ret,
473 struct btrfs_key *progress)
474{
475 struct extent_buffer *cur;
476 u64 blocknr;
477 u64 gen;
478 u64 search_start = *last_ret;
479 u64 last_block = 0;
480 u64 other;
481 u32 parent_nritems;
482 int end_slot;
483 int i;
484 int err = 0;
485 int parent_level;
486 int uptodate;
487 u32 blocksize;
488 int progress_passed = 0;
489 struct btrfs_disk_key disk_key;
490
491 parent_level = btrfs_header_level(parent);
492 if (cache_only && parent_level != 1)
493 return 0;
494
495 if (trans->transaction != root->fs_info->running_transaction)
496 WARN_ON(1);
497 if (trans->transid != root->fs_info->generation)
498 WARN_ON(1);
499
500 parent_nritems = btrfs_header_nritems(parent);
501 blocksize = btrfs_level_size(root, parent_level - 1);
502 end_slot = parent_nritems;
503
504 if (parent_nritems == 1)
505 return 0;
506
507 for (i = start_slot; i < end_slot; i++) {
508 int close = 1;
509
510 if (!parent->map_token) {
511 map_extent_buffer(parent,
512 btrfs_node_key_ptr_offset(i),
513 sizeof(struct btrfs_key_ptr),
514 &parent->map_token, &parent->kaddr,
515 &parent->map_start, &parent->map_len,
516 KM_USER1);
517 }
518 btrfs_node_key(parent, &disk_key, i);
519 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
520 continue;
521
522 progress_passed = 1;
523 blocknr = btrfs_node_blockptr(parent, i);
524 gen = btrfs_node_ptr_generation(parent, i);
525 if (last_block == 0)
526 last_block = blocknr;
527
528 if (i > 0) {
529 other = btrfs_node_blockptr(parent, i - 1);
530 close = close_blocks(blocknr, other, blocksize);
531 }
532 if (!close && i < end_slot - 2) {
533 other = btrfs_node_blockptr(parent, i + 1);
534 close = close_blocks(blocknr, other, blocksize);
535 }
536 if (close) {
537 last_block = blocknr;
538 continue;
539 }
540 if (parent->map_token) {
541 unmap_extent_buffer(parent, parent->map_token,
542 KM_USER1);
543 parent->map_token = NULL;
544 }
545
546 cur = btrfs_find_tree_block(root, blocknr, blocksize);
547 if (cur)
548 uptodate = btrfs_buffer_uptodate(cur, gen);
549 else
550 uptodate = 0;
551 if (!cur || !uptodate) {
552 if (cache_only) {
553 free_extent_buffer(cur);
554 continue;
555 }
556 if (!cur) {
557 cur = read_tree_block(root, blocknr,
558 blocksize, gen);
559 } else if (!uptodate) {
560 btrfs_read_buffer(cur, gen);
561 }
562 }
563 if (search_start == 0)
564 search_start = last_block;
565
566 btrfs_tree_lock(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start,
569 min(16 * blocksize,
570 (end_slot - i) * blocksize), 0);
571 if (err) {
572 btrfs_tree_unlock(cur);
573 free_extent_buffer(cur);
574 break;
575 }
576 search_start = cur->start;
577 last_block = cur->start;
578 *last_ret = search_start;
579 btrfs_tree_unlock(cur);
580 free_extent_buffer(cur);
581 }
582 if (parent->map_token) {
583 unmap_extent_buffer(parent, parent->map_token,
584 KM_USER1);
585 parent->map_token = NULL;
586 }
587 return err;
588}
589
590/*
591 * The leaf data grows from end-to-front in the node.
592 * this returns the address of the start of the last item,
593 * which is the stop of the leaf data stack
594 */
595static inline unsigned int leaf_data_end(struct btrfs_root *root,
596 struct extent_buffer *leaf)
597{
598 u32 nr = btrfs_header_nritems(leaf);
599 if (nr == 0)
600 return BTRFS_LEAF_DATA_SIZE(root);
601 return btrfs_item_offset_nr(leaf, nr - 1);
602}
603
604/*
605 * extra debugging checks to make sure all the items in a key are
606 * well formed and in the proper order
607 */
608static int check_node(struct btrfs_root *root, struct btrfs_path *path,
609 int level)
610{
611 struct extent_buffer *parent = NULL;
612 struct extent_buffer *node = path->nodes[level];
613 struct btrfs_disk_key parent_key;
614 struct btrfs_disk_key node_key;
615 int parent_slot;
616 int slot;
617 struct btrfs_key cpukey;
618 u32 nritems = btrfs_header_nritems(node);
619
620 if (path->nodes[level + 1])
621 parent = path->nodes[level + 1];
622
623 slot = path->slots[level];
624 BUG_ON(nritems == 0);
625 if (parent) {
626 parent_slot = path->slots[level + 1];
627 btrfs_node_key(parent, &parent_key, parent_slot);
628 btrfs_node_key(node, &node_key, 0);
629 BUG_ON(memcmp(&parent_key, &node_key,
630 sizeof(struct btrfs_disk_key)));
631 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
632 btrfs_header_bytenr(node));
633 }
634 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
635 if (slot != 0) {
636 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
637 btrfs_node_key(node, &node_key, slot);
638 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
639 }
640 if (slot < nritems - 1) {
641 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
642 btrfs_node_key(node, &node_key, slot);
643 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
644 }
645 return 0;
646}
647
648/*
649 * extra checking to make sure all the items in a leaf are
650 * well formed and in the proper order
651 */
652static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
653 int level)
654{
655 struct extent_buffer *leaf = path->nodes[level];
656 struct extent_buffer *parent = NULL;
657 int parent_slot;
658 struct btrfs_key cpukey;
659 struct btrfs_disk_key parent_key;
660 struct btrfs_disk_key leaf_key;
661 int slot = path->slots[0];
662
663 u32 nritems = btrfs_header_nritems(leaf);
664
665 if (path->nodes[level + 1])
666 parent = path->nodes[level + 1];
667
668 if (nritems == 0)
669 return 0;
670
671 if (parent) {
672 parent_slot = path->slots[level + 1];
673 btrfs_node_key(parent, &parent_key, parent_slot);
674 btrfs_item_key(leaf, &leaf_key, 0);
675
676 BUG_ON(memcmp(&parent_key, &leaf_key,
677 sizeof(struct btrfs_disk_key)));
678 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
679 btrfs_header_bytenr(leaf));
680 }
681 if (slot != 0 && slot < nritems - 1) {
682 btrfs_item_key(leaf, &leaf_key, slot);
683 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
684 if (comp_keys(&leaf_key, &cpukey) <= 0) {
685 btrfs_print_leaf(root, leaf);
686 printk(KERN_CRIT "slot %d offset bad key\n", slot);
687 BUG_ON(1);
688 }
689 if (btrfs_item_offset_nr(leaf, slot - 1) !=
690 btrfs_item_end_nr(leaf, slot)) {
691 btrfs_print_leaf(root, leaf);
692 printk(KERN_CRIT "slot %d offset bad\n", slot);
693 BUG_ON(1);
694 }
695 }
696 if (slot < nritems - 1) {
697 btrfs_item_key(leaf, &leaf_key, slot);
698 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
699 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
700 if (btrfs_item_offset_nr(leaf, slot) !=
701 btrfs_item_end_nr(leaf, slot + 1)) {
702 btrfs_print_leaf(root, leaf);
703 printk(KERN_CRIT "slot %d offset bad\n", slot);
704 BUG_ON(1);
705 }
706 }
707 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
708 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
709 return 0;
710}
711
712static noinline int check_block(struct btrfs_root *root,
713 struct btrfs_path *path, int level)
714{
715 return 0;
716 if (level == 0)
717 return check_leaf(root, path, level);
718 return check_node(root, path, level);
719}
720
721/*
722 * search for key in the extent_buffer. The items start at offset p,
723 * and they are item_size apart. There are 'max' items in p.
724 *
725 * the slot in the array is returned via slot, and it points to
726 * the place where you would insert key if it is not found in
727 * the array.
728 *
729 * slot may point to max if the key is bigger than all of the keys
730 */
731static noinline int generic_bin_search(struct extent_buffer *eb,
732 unsigned long p,
733 int item_size, struct btrfs_key *key,
734 int max, int *slot)
735{
736 int low = 0;
737 int high = max;
738 int mid;
739 int ret;
740 struct btrfs_disk_key *tmp = NULL;
741 struct btrfs_disk_key unaligned;
742 unsigned long offset;
743 char *map_token = NULL;
744 char *kaddr = NULL;
745 unsigned long map_start = 0;
746 unsigned long map_len = 0;
747 int err;
748
749 while (low < high) {
750 mid = (low + high) / 2;
751 offset = p + mid * item_size;
752
753 if (!map_token || offset < map_start ||
754 (offset + sizeof(struct btrfs_disk_key)) >
755 map_start + map_len) {
756 if (map_token) {
757 unmap_extent_buffer(eb, map_token, KM_USER0);
758 map_token = NULL;
759 }
760
761 err = map_private_extent_buffer(eb, offset,
762 sizeof(struct btrfs_disk_key),
763 &map_token, &kaddr,
764 &map_start, &map_len, KM_USER0);
765
766 if (!err) {
767 tmp = (struct btrfs_disk_key *)(kaddr + offset -
768 map_start);
769 } else {
770 read_extent_buffer(eb, &unaligned,
771 offset, sizeof(unaligned));
772 tmp = &unaligned;
773 }
774
775 } else {
776 tmp = (struct btrfs_disk_key *)(kaddr + offset -
777 map_start);
778 }
779 ret = comp_keys(tmp, key);
780
781 if (ret < 0)
782 low = mid + 1;
783 else if (ret > 0)
784 high = mid;
785 else {
786 *slot = mid;
787 if (map_token)
788 unmap_extent_buffer(eb, map_token, KM_USER0);
789 return 0;
790 }
791 }
792 *slot = low;
793 if (map_token)
794 unmap_extent_buffer(eb, map_token, KM_USER0);
795 return 1;
796}
797
798/*
799 * simple bin_search frontend that does the right thing for
800 * leaves vs nodes
801 */
802static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
803 int level, int *slot)
804{
805 if (level == 0) {
806 return generic_bin_search(eb,
807 offsetof(struct btrfs_leaf, items),
808 sizeof(struct btrfs_item),
809 key, btrfs_header_nritems(eb),
810 slot);
811 } else {
812 return generic_bin_search(eb,
813 offsetof(struct btrfs_node, ptrs),
814 sizeof(struct btrfs_key_ptr),
815 key, btrfs_header_nritems(eb),
816 slot);
817 }
818 return -1;
819}
820
821/* given a node and slot number, this reads the blocks it points to. The
822 * extent buffer is returned with a reference taken (but unlocked).
823 * NULL is returned on error.
824 */
825static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
826 struct extent_buffer *parent, int slot)
827{
828 int level = btrfs_header_level(parent);
829 if (slot < 0)
830 return NULL;
831 if (slot >= btrfs_header_nritems(parent))
832 return NULL;
833
834 BUG_ON(level == 0);
835
836 return read_tree_block(root, btrfs_node_blockptr(parent, slot),
837 btrfs_level_size(root, level - 1),
838 btrfs_node_ptr_generation(parent, slot));
839}
840
841/*
842 * node level balancing, used to make sure nodes are in proper order for
843 * item deletion. We balance from the top down, so we have to make sure
844 * that a deletion won't leave an node completely empty later on.
845 */
846static noinline int balance_level(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_path *path, int level)
849{
850 struct extent_buffer *right = NULL;
851 struct extent_buffer *mid;
852 struct extent_buffer *left = NULL;
853 struct extent_buffer *parent = NULL;
854 int ret = 0;
855 int wret;
856 int pslot;
857 int orig_slot = path->slots[level];
858 int err_on_enospc = 0;
859 u64 orig_ptr;
860
861 if (level == 0)
862 return 0;
863
864 mid = path->nodes[level];
865 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867
868 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
869
870 if (level < BTRFS_MAX_LEVEL - 1)
871 parent = path->nodes[level + 1];
872 pslot = path->slots[level + 1];
873
874 /*
875 * deal with the case where there is only one pointer in the root
876 * by promoting the node below to a root
877 */
878 if (!parent) {
879 struct extent_buffer *child;
880
881 if (btrfs_header_nritems(mid) != 1)
882 return 0;
883
884 /* promote the child to a root */
885 child = read_node_slot(root, mid, 0);
886 btrfs_tree_lock(child);
887 BUG_ON(!child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret);
890
891 spin_lock(&root->node_lock);
892 root->node = child;
893 spin_unlock(&root->node_lock);
894
895 ret = btrfs_update_extent_ref(trans, root, child->start,
896 mid->start, child->start,
897 root->root_key.objectid,
898 trans->transid, level - 1);
899 BUG_ON(ret);
900
901 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child);
903 path->locks[level] = 0;
904 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid);
906 btrfs_tree_unlock(mid);
907 /* once for the path */
908 free_extent_buffer(mid);
909 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
910 mid->start, root->root_key.objectid,
911 btrfs_header_generation(mid),
912 level, 1);
913 /* once for the root ptr */
914 free_extent_buffer(mid);
915 return ret;
916 }
917 if (btrfs_header_nritems(mid) >
918 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
919 return 0;
920
921 if (btrfs_header_nritems(mid) < 2)
922 err_on_enospc = 1;
923
924 left = read_node_slot(root, parent, pslot - 1);
925 if (left) {
926 btrfs_tree_lock(left);
927 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0);
929 if (wret) {
930 ret = wret;
931 goto enospc;
932 }
933 }
934 right = read_node_slot(root, parent, pslot + 1);
935 if (right) {
936 btrfs_tree_lock(right);
937 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0);
939 if (wret) {
940 ret = wret;
941 goto enospc;
942 }
943 }
944
945 /* first, try to make some room in the middle buffer */
946 if (left) {
947 orig_slot += btrfs_header_nritems(left);
948 wret = push_node_left(trans, root, left, mid, 1);
949 if (wret < 0)
950 ret = wret;
951 if (btrfs_header_nritems(mid) < 2)
952 err_on_enospc = 1;
953 }
954
955 /*
956 * then try to empty the right most buffer into the middle
957 */
958 if (right) {
959 wret = push_node_left(trans, root, mid, right, 1);
960 if (wret < 0 && wret != -ENOSPC)
961 ret = wret;
962 if (btrfs_header_nritems(right) == 0) {
963 u64 bytenr = right->start;
964 u64 generation = btrfs_header_generation(parent);
965 u32 blocksize = right->len;
966
967 clean_tree_block(trans, root, right);
968 btrfs_tree_unlock(right);
969 free_extent_buffer(right);
970 right = NULL;
971 wret = del_ptr(trans, root, path, level + 1, pslot +
972 1);
973 if (wret)
974 ret = wret;
975 wret = btrfs_free_extent(trans, root, bytenr,
976 blocksize, parent->start,
977 btrfs_header_owner(parent),
978 generation, level, 1);
979 if (wret)
980 ret = wret;
981 } else {
982 struct btrfs_disk_key right_key;
983 btrfs_node_key(right, &right_key, 0);
984 btrfs_set_node_key(parent, &right_key, pslot + 1);
985 btrfs_mark_buffer_dirty(parent);
986 }
987 }
988 if (btrfs_header_nritems(mid) == 1) {
989 /*
990 * we're not allowed to leave a node with one item in the
991 * tree during a delete. A deletion from lower in the tree
992 * could try to delete the only pointer in this node.
993 * So, pull some keys from the left.
994 * There has to be a left pointer at this point because
995 * otherwise we would have pulled some pointers from the
996 * right
997 */
998 BUG_ON(!left);
999 wret = balance_node_right(trans, root, mid, left);
1000 if (wret < 0) {
1001 ret = wret;
1002 goto enospc;
1003 }
1004 if (wret == 1) {
1005 wret = push_node_left(trans, root, left, mid, 1);
1006 if (wret < 0)
1007 ret = wret;
1008 }
1009 BUG_ON(wret == 1);
1010 }
1011 if (btrfs_header_nritems(mid) == 0) {
1012 /* we've managed to empty the middle node, drop it */
1013 u64 root_gen = btrfs_header_generation(parent);
1014 u64 bytenr = mid->start;
1015 u32 blocksize = mid->len;
1016
1017 clean_tree_block(trans, root, mid);
1018 btrfs_tree_unlock(mid);
1019 free_extent_buffer(mid);
1020 mid = NULL;
1021 wret = del_ptr(trans, root, path, level + 1, pslot);
1022 if (wret)
1023 ret = wret;
1024 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1025 parent->start,
1026 btrfs_header_owner(parent),
1027 root_gen, level, 1);
1028 if (wret)
1029 ret = wret;
1030 } else {
1031 /* update the parent key to reflect our changes */
1032 struct btrfs_disk_key mid_key;
1033 btrfs_node_key(mid, &mid_key, 0);
1034 btrfs_set_node_key(parent, &mid_key, pslot);
1035 btrfs_mark_buffer_dirty(parent);
1036 }
1037
1038 /* update the path */
1039 if (left) {
1040 if (btrfs_header_nritems(left) > orig_slot) {
1041 extent_buffer_get(left);
1042 /* left was locked after cow */
1043 path->nodes[level] = left;
1044 path->slots[level + 1] -= 1;
1045 path->slots[level] = orig_slot;
1046 if (mid) {
1047 btrfs_tree_unlock(mid);
1048 free_extent_buffer(mid);
1049 }
1050 } else {
1051 orig_slot -= btrfs_header_nritems(left);
1052 path->slots[level] = orig_slot;
1053 }
1054 }
1055 /* double check we haven't messed things up */
1056 check_block(root, path, level);
1057 if (orig_ptr !=
1058 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1059 BUG();
1060enospc:
1061 if (right) {
1062 btrfs_tree_unlock(right);
1063 free_extent_buffer(right);
1064 }
1065 if (left) {
1066 if (path->nodes[level] != left)
1067 btrfs_tree_unlock(left);
1068 free_extent_buffer(left);
1069 }
1070 return ret;
1071}
1072
1073/* Node balancing for insertion. Here we only split or push nodes around
1074 * when they are completely full. This is also done top down, so we
1075 * have to be pessimistic.
1076 */
1077static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path, int level)
1080{
1081 struct extent_buffer *right = NULL;
1082 struct extent_buffer *mid;
1083 struct extent_buffer *left = NULL;
1084 struct extent_buffer *parent = NULL;
1085 int ret = 0;
1086 int wret;
1087 int pslot;
1088 int orig_slot = path->slots[level];
1089 u64 orig_ptr;
1090
1091 if (level == 0)
1092 return 1;
1093
1094 mid = path->nodes[level];
1095 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1096 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1097
1098 if (level < BTRFS_MAX_LEVEL - 1)
1099 parent = path->nodes[level + 1];
1100 pslot = path->slots[level + 1];
1101
1102 if (!parent)
1103 return 1;
1104
1105 left = read_node_slot(root, parent, pslot - 1);
1106
1107 /* first, try to make some room in the middle buffer */
1108 if (left) {
1109 u32 left_nr;
1110
1111 btrfs_tree_lock(left);
1112 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1;
1115 } else {
1116 ret = btrfs_cow_block(trans, root, left, parent,
1117 pslot - 1, &left, 0);
1118 if (ret)
1119 wret = 1;
1120 else {
1121 wret = push_node_left(trans, root,
1122 left, mid, 0);
1123 }
1124 }
1125 if (wret < 0)
1126 ret = wret;
1127 if (wret == 0) {
1128 struct btrfs_disk_key disk_key;
1129 orig_slot += left_nr;
1130 btrfs_node_key(mid, &disk_key, 0);
1131 btrfs_set_node_key(parent, &disk_key, pslot);
1132 btrfs_mark_buffer_dirty(parent);
1133 if (btrfs_header_nritems(left) > orig_slot) {
1134 path->nodes[level] = left;
1135 path->slots[level + 1] -= 1;
1136 path->slots[level] = orig_slot;
1137 btrfs_tree_unlock(mid);
1138 free_extent_buffer(mid);
1139 } else {
1140 orig_slot -=
1141 btrfs_header_nritems(left);
1142 path->slots[level] = orig_slot;
1143 btrfs_tree_unlock(left);
1144 free_extent_buffer(left);
1145 }
1146 return 0;
1147 }
1148 btrfs_tree_unlock(left);
1149 free_extent_buffer(left);
1150 }
1151 right = read_node_slot(root, parent, pslot + 1);
1152
1153 /*
1154 * then try to empty the right most buffer into the middle
1155 */
1156 if (right) {
1157 u32 right_nr;
1158 btrfs_tree_lock(right);
1159 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1;
1162 } else {
1163 ret = btrfs_cow_block(trans, root, right,
1164 parent, pslot + 1,
1165 &right, 0);
1166 if (ret)
1167 wret = 1;
1168 else {
1169 wret = balance_node_right(trans, root,
1170 right, mid);
1171 }
1172 }
1173 if (wret < 0)
1174 ret = wret;
1175 if (wret == 0) {
1176 struct btrfs_disk_key disk_key;
1177
1178 btrfs_node_key(right, &disk_key, 0);
1179 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1180 btrfs_mark_buffer_dirty(parent);
1181
1182 if (btrfs_header_nritems(mid) <= orig_slot) {
1183 path->nodes[level] = right;
1184 path->slots[level + 1] += 1;
1185 path->slots[level] = orig_slot -
1186 btrfs_header_nritems(mid);
1187 btrfs_tree_unlock(mid);
1188 free_extent_buffer(mid);
1189 } else {
1190 btrfs_tree_unlock(right);
1191 free_extent_buffer(right);
1192 }
1193 return 0;
1194 }
1195 btrfs_tree_unlock(right);
1196 free_extent_buffer(right);
1197 }
1198 return 1;
1199}
1200
1201/*
1202 * readahead one full node of leaves, finding things that are close
1203 * to the block in 'slot', and triggering ra on them.
1204 */
1205static noinline void reada_for_search(struct btrfs_root *root,
1206 struct btrfs_path *path,
1207 int level, int slot, u64 objectid)
1208{
1209 struct extent_buffer *node;
1210 struct btrfs_disk_key disk_key;
1211 u32 nritems;
1212 u64 search;
1213 u64 lowest_read;
1214 u64 highest_read;
1215 u64 nread = 0;
1216 int direction = path->reada;
1217 struct extent_buffer *eb;
1218 u32 nr;
1219 u32 blocksize;
1220 u32 nscan = 0;
1221
1222 if (level != 1)
1223 return;
1224
1225 if (!path->nodes[level])
1226 return;
1227
1228 node = path->nodes[level];
1229
1230 search = btrfs_node_blockptr(node, slot);
1231 blocksize = btrfs_level_size(root, level - 1);
1232 eb = btrfs_find_tree_block(root, search, blocksize);
1233 if (eb) {
1234 free_extent_buffer(eb);
1235 return;
1236 }
1237
1238 highest_read = search;
1239 lowest_read = search;
1240
1241 nritems = btrfs_header_nritems(node);
1242 nr = slot;
1243 while (1) {
1244 if (direction < 0) {
1245 if (nr == 0)
1246 break;
1247 nr--;
1248 } else if (direction > 0) {
1249 nr++;
1250 if (nr >= nritems)
1251 break;
1252 }
1253 if (path->reada < 0 && objectid) {
1254 btrfs_node_key(node, &disk_key, nr);
1255 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1256 break;
1257 }
1258 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) ||
1260 (search < lowest_read && lowest_read - search <= 16384) ||
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize;
1265 }
1266 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1268 break;
1269
1270 if (nread > (256 * 1024) || nscan > 128)
1271 break;
1272
1273 if (search < lowest_read)
1274 lowest_read = search;
1275 if (search > highest_read)
1276 highest_read = search;
1277 }
1278}
1279
1280/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because
1283 * operations on the tree might require changing key pointers higher up in the
1284 * tree.
1285 *
1286 * callers might also have set path->keep_locks, which tells this code to keep
1287 * the lock if the path points to the last slot in the block. This is part of
1288 * walking through the tree, and selecting the next slot in the higher block.
1289 *
1290 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1291 * if lowest_unlock is 1, level 0 won't be unlocked
1292 */
1293static noinline void unlock_up(struct btrfs_path *path, int level,
1294 int lowest_unlock)
1295{
1296 int i;
1297 int skip_level = level;
1298 int no_skips = 0;
1299 struct extent_buffer *t;
1300
1301 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1302 if (!path->nodes[i])
1303 break;
1304 if (!path->locks[i])
1305 break;
1306 if (!no_skips && path->slots[i] == 0) {
1307 skip_level = i + 1;
1308 continue;
1309 }
1310 if (!no_skips && path->keep_locks) {
1311 u32 nritems;
1312 t = path->nodes[i];
1313 nritems = btrfs_header_nritems(t);
1314 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1315 skip_level = i + 1;
1316 continue;
1317 }
1318 }
1319 if (skip_level < i && i >= lowest_unlock)
1320 no_skips = 1;
1321
1322 t = path->nodes[i];
1323 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1324 btrfs_tree_unlock(t);
1325 path->locks[i] = 0;
1326 }
1327 }
1328}
1329
1330/*
1331 * look for key in the tree. path is filled in with nodes along the way
1332 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0)
1334 *
1335 * If the key isn't found, the path points to the slot where it should
1336 * be inserted, and 1 is returned. If there are other errors during the
1337 * search a negative error number is returned.
1338 *
1339 * if ins_len > 0, nodes and leaves will be split as we walk down the
1340 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
1341 * possible)
1342 */
1343int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1344 *root, struct btrfs_key *key, struct btrfs_path *p, int
1345 ins_len, int cow)
1346{
1347 struct extent_buffer *b;
1348 struct extent_buffer *tmp;
1349 int slot;
1350 int ret;
1351 int level;
1352 int should_reada = p->reada;
1353 int lowest_unlock = 1;
1354 int blocksize;
1355 u8 lowest_level = 0;
1356 u64 blocknr;
1357 u64 gen;
1358 struct btrfs_key prealloc_block;
1359
1360 lowest_level = p->lowest_level;
1361 WARN_ON(lowest_level && ins_len > 0);
1362 WARN_ON(p->nodes[0] != NULL);
1363
1364 if (ins_len < 0)
1365 lowest_unlock = 2;
1366
1367 prealloc_block.objectid = 0;
1368
1369again:
1370 if (p->skip_locking)
1371 b = btrfs_root_node(root);
1372 else
1373 b = btrfs_lock_root_node(root);
1374
1375 while (b) {
1376 level = btrfs_header_level(b);
1377
1378 /*
1379 * setup the path here so we can release it under lock
1380 * contention with the cow code
1381 */
1382 p->nodes[level] = b;
1383 if (!p->skip_locking)
1384 p->locks[level] = 1;
1385
1386 if (cow) {
1387 int wret;
1388
1389 /* is a cow on this block not required */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done;
1396 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398
1399 /* ok, we have to cow, is our old prealloc the right
1400 * size?
1401 */
1402 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) {
1404 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid,
1406 prealloc_block.offset);
1407 prealloc_block.objectid = 0;
1408 }
1409
1410 /*
1411 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held.
1413 */
1414 if (level > 1 && !prealloc_block.objectid &&
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len;
1417 u64 hint = b->start;
1418
1419 btrfs_release_path(root, p);
1420 ret = btrfs_reserve_extent(trans, root,
1421 size, size, 0,
1422 hint, (u64)-1,
1423 &prealloc_block, 0);
1424 BUG_ON(ret);
1425 goto again;
1426 }
1427
1428 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1],
1430 p->slots[level + 1],
1431 &b, prealloc_block.objectid);
1432 prealloc_block.objectid = 0;
1433 if (wret) {
1434 free_extent_buffer(b);
1435 ret = wret;
1436 goto done;
1437 }
1438 }
1439cow_done:
1440 BUG_ON(!cow && ins_len);
1441 if (level != btrfs_header_level(b))
1442 WARN_ON(1);
1443 level = btrfs_header_level(b);
1444
1445 p->nodes[level] = b;
1446 if (!p->skip_locking)
1447 p->locks[level] = 1;
1448
1449 ret = check_block(root, p, level);
1450 if (ret) {
1451 ret = -1;
1452 goto done;
1453 }
1454
1455 ret = bin_search(b, key, level, &slot);
1456 if (level != 0) {
1457 if (ret && slot > 0)
1458 slot -= 1;
1459 p->slots[level] = slot;
1460 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level);
1464 BUG_ON(sret > 0);
1465 if (sret) {
1466 ret = sret;
1467 goto done;
1468 }
1469 b = p->nodes[level];
1470 slot = p->slots[level];
1471 } else if (ins_len < 0) {
1472 int sret = balance_level(trans, root, p,
1473 level);
1474 if (sret) {
1475 ret = sret;
1476 goto done;
1477 }
1478 b = p->nodes[level];
1479 if (!b) {
1480 btrfs_release_path(NULL, p);
1481 goto again;
1482 }
1483 slot = p->slots[level];
1484 BUG_ON(btrfs_header_nritems(b) == 1);
1485 }
1486 unlock_up(p, level, lowest_unlock);
1487
1488 /* this is only true while dropping a snapshot */
1489 if (level == lowest_level) {
1490 ret = 0;
1491 goto done;
1492 }
1493
1494 blocknr = btrfs_node_blockptr(b, slot);
1495 gen = btrfs_node_ptr_generation(b, slot);
1496 blocksize = btrfs_level_size(root, level - 1);
1497
1498 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1499 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1500 b = tmp;
1501 } else {
1502 /*
1503 * reduce lock contention at high levels
1504 * of the btree by dropping locks before
1505 * we read.
1506 */
1507 if (level > 1) {
1508 btrfs_release_path(NULL, p);
1509 if (tmp)
1510 free_extent_buffer(tmp);
1511 if (should_reada)
1512 reada_for_search(root, p,
1513 level, slot,
1514 key->objectid);
1515
1516 tmp = read_tree_block(root, blocknr,
1517 blocksize, gen);
1518 if (tmp)
1519 free_extent_buffer(tmp);
1520 goto again;
1521 } else {
1522 if (tmp)
1523 free_extent_buffer(tmp);
1524 if (should_reada)
1525 reada_for_search(root, p,
1526 level, slot,
1527 key->objectid);
1528 b = read_node_slot(root, b, slot);
1529 }
1530 }
1531 if (!p->skip_locking)
1532 btrfs_tree_lock(b);
1533 } else {
1534 p->slots[level] = slot;
1535 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0);
1539 BUG_ON(sret > 0);
1540 if (sret) {
1541 ret = sret;
1542 goto done;
1543 }
1544 }
1545 if (!p->search_for_split)
1546 unlock_up(p, level, lowest_unlock);
1547 goto done;
1548 }
1549 }
1550 ret = 1;
1551done:
1552 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid,
1555 prealloc_block.offset);
1556 }
1557
1558 return ret;
1559}
1560
1561int btrfs_merge_path(struct btrfs_trans_handle *trans,
1562 struct btrfs_root *root,
1563 struct btrfs_key *node_keys,
1564 u64 *nodes, int lowest_level)
1565{
1566 struct extent_buffer *eb;
1567 struct extent_buffer *parent;
1568 struct btrfs_key key;
1569 u64 bytenr;
1570 u64 generation;
1571 u32 blocksize;
1572 int level;
1573 int slot;
1574 int key_match;
1575 int ret;
1576
1577 eb = btrfs_lock_root_node(root);
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret);
1580
1581 parent = eb;
1582 while (1) {
1583 level = btrfs_header_level(parent);
1584 if (level == 0 || level <= lowest_level)
1585 break;
1586
1587 ret = bin_search(parent, &node_keys[lowest_level], level,
1588 &slot);
1589 if (ret && slot > 0)
1590 slot--;
1591
1592 bytenr = btrfs_node_blockptr(parent, slot);
1593 if (nodes[level - 1] == bytenr)
1594 break;
1595
1596 blocksize = btrfs_level_size(root, level - 1);
1597 generation = btrfs_node_ptr_generation(parent, slot);
1598 btrfs_node_key_to_cpu(eb, &key, slot);
1599 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1600
1601 if (generation == trans->transid) {
1602 eb = read_tree_block(root, bytenr, blocksize,
1603 generation);
1604 btrfs_tree_lock(eb);
1605 }
1606
1607 /*
1608 * if node keys match and node pointer hasn't been modified
1609 * in the running transaction, we can merge the path. for
1610 * blocks owened by reloc trees, the node pointer check is
1611 * skipped, this is because these blocks are fully controlled
1612 * by the space balance code, no one else can modify them.
1613 */
1614 if (!nodes[level - 1] || !key_match ||
1615 (generation == trans->transid &&
1616 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1617 if (level == 1 || level == lowest_level + 1) {
1618 if (generation == trans->transid) {
1619 btrfs_tree_unlock(eb);
1620 free_extent_buffer(eb);
1621 }
1622 break;
1623 }
1624
1625 if (generation != trans->transid) {
1626 eb = read_tree_block(root, bytenr, blocksize,
1627 generation);
1628 btrfs_tree_lock(eb);
1629 }
1630
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1632 &eb, 0);
1633 BUG_ON(ret);
1634
1635 if (root->root_key.objectid ==
1636 BTRFS_TREE_RELOC_OBJECTID) {
1637 if (!nodes[level - 1]) {
1638 nodes[level - 1] = eb->start;
1639 memcpy(&node_keys[level - 1], &key,
1640 sizeof(node_keys[0]));
1641 } else {
1642 WARN_ON(1);
1643 }
1644 }
1645
1646 btrfs_tree_unlock(parent);
1647 free_extent_buffer(parent);
1648 parent = eb;
1649 continue;
1650 }
1651
1652 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1653 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1654 btrfs_mark_buffer_dirty(parent);
1655
1656 ret = btrfs_inc_extent_ref(trans, root,
1657 nodes[level - 1],
1658 blocksize, parent->start,
1659 btrfs_header_owner(parent),
1660 btrfs_header_generation(parent),
1661 level - 1);
1662 BUG_ON(ret);
1663
1664 /*
1665 * If the block was created in the running transaction,
1666 * it's possible this is the last reference to it, so we
1667 * should drop the subtree.
1668 */
1669 if (generation == trans->transid) {
1670 ret = btrfs_drop_subtree(trans, root, eb, parent);
1671 BUG_ON(ret);
1672 btrfs_tree_unlock(eb);
1673 free_extent_buffer(eb);
1674 } else {
1675 ret = btrfs_free_extent(trans, root, bytenr,
1676 blocksize, parent->start,
1677 btrfs_header_owner(parent),
1678 btrfs_header_generation(parent),
1679 level - 1, 1);
1680 BUG_ON(ret);
1681 }
1682 break;
1683 }
1684 btrfs_tree_unlock(parent);
1685 free_extent_buffer(parent);
1686 return 0;
1687}
1688
1689/*
1690 * adjust the pointers going up the tree, starting at level
1691 * making sure the right key of each node is points to 'key'.
1692 * This is used after shifting pointers to the left, so it stops
1693 * fixing up pointers when a given leaf/node is not in slot 0 of the
1694 * higher levels
1695 *
1696 * If this fails to write a tree block, it returns -1, but continues
1697 * fixing up the blocks in ram so the tree is consistent.
1698 */
1699static int fixup_low_keys(struct btrfs_trans_handle *trans,
1700 struct btrfs_root *root, struct btrfs_path *path,
1701 struct btrfs_disk_key *key, int level)
1702{
1703 int i;
1704 int ret = 0;
1705 struct extent_buffer *t;
1706
1707 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1708 int tslot = path->slots[i];
1709 if (!path->nodes[i])
1710 break;
1711 t = path->nodes[i];
1712 btrfs_set_node_key(t, key, tslot);
1713 btrfs_mark_buffer_dirty(path->nodes[i]);
1714 if (tslot != 0)
1715 break;
1716 }
1717 return ret;
1718}
1719
1720/*
1721 * update item key.
1722 *
1723 * This function isn't completely safe. It's the caller's responsibility
1724 * that the new key won't break the order
1725 */
1726int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1727 struct btrfs_root *root, struct btrfs_path *path,
1728 struct btrfs_key *new_key)
1729{
1730 struct btrfs_disk_key disk_key;
1731 struct extent_buffer *eb;
1732 int slot;
1733
1734 eb = path->nodes[0];
1735 slot = path->slots[0];
1736 if (slot > 0) {
1737 btrfs_item_key(eb, &disk_key, slot - 1);
1738 if (comp_keys(&disk_key, new_key) >= 0)
1739 return -1;
1740 }
1741 if (slot < btrfs_header_nritems(eb) - 1) {
1742 btrfs_item_key(eb, &disk_key, slot + 1);
1743 if (comp_keys(&disk_key, new_key) <= 0)
1744 return -1;
1745 }
1746
1747 btrfs_cpu_key_to_disk(&disk_key, new_key);
1748 btrfs_set_item_key(eb, &disk_key, slot);
1749 btrfs_mark_buffer_dirty(eb);
1750 if (slot == 0)
1751 fixup_low_keys(trans, root, path, &disk_key, 1);
1752 return 0;
1753}
1754
1755/*
1756 * try to push data from one node into the next node left in the
1757 * tree.
1758 *
1759 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1760 * error, and > 0 if there was no room in the left hand block.
1761 */
1762static int push_node_left(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root, struct extent_buffer *dst,
1764 struct extent_buffer *src, int empty)
1765{
1766 int push_items = 0;
1767 int src_nritems;
1768 int dst_nritems;
1769 int ret = 0;
1770
1771 src_nritems = btrfs_header_nritems(src);
1772 dst_nritems = btrfs_header_nritems(dst);
1773 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1774 WARN_ON(btrfs_header_generation(src) != trans->transid);
1775 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1776
1777 if (!empty && src_nritems <= 8)
1778 return 1;
1779
1780 if (push_items <= 0)
1781 return 1;
1782
1783 if (empty) {
1784 push_items = min(src_nritems, push_items);
1785 if (push_items < src_nritems) {
1786 /* leave at least 8 pointers in the node if
1787 * we aren't going to empty it
1788 */
1789 if (src_nritems - push_items < 8) {
1790 if (push_items <= 8)
1791 return 1;
1792 push_items -= 8;
1793 }
1794 }
1795 } else
1796 push_items = min(src_nritems - 8, push_items);
1797
1798 copy_extent_buffer(dst, src,
1799 btrfs_node_key_ptr_offset(dst_nritems),
1800 btrfs_node_key_ptr_offset(0),
1801 push_items * sizeof(struct btrfs_key_ptr));
1802
1803 if (push_items < src_nritems) {
1804 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1805 btrfs_node_key_ptr_offset(push_items),
1806 (src_nritems - push_items) *
1807 sizeof(struct btrfs_key_ptr));
1808 }
1809 btrfs_set_header_nritems(src, src_nritems - push_items);
1810 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1811 btrfs_mark_buffer_dirty(src);
1812 btrfs_mark_buffer_dirty(dst);
1813
1814 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1815 BUG_ON(ret);
1816
1817 return ret;
1818}
1819
1820/*
1821 * try to push data from one node into the next node right in the
1822 * tree.
1823 *
1824 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1825 * error, and > 0 if there was no room in the right hand block.
1826 *
1827 * this will only push up to 1/2 the contents of the left node over
1828 */
1829static int balance_node_right(struct btrfs_trans_handle *trans,
1830 struct btrfs_root *root,
1831 struct extent_buffer *dst,
1832 struct extent_buffer *src)
1833{
1834 int push_items = 0;
1835 int max_push;
1836 int src_nritems;
1837 int dst_nritems;
1838 int ret = 0;
1839
1840 WARN_ON(btrfs_header_generation(src) != trans->transid);
1841 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1842
1843 src_nritems = btrfs_header_nritems(src);
1844 dst_nritems = btrfs_header_nritems(dst);
1845 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1846 if (push_items <= 0)
1847 return 1;
1848
1849 if (src_nritems < 4)
1850 return 1;
1851
1852 max_push = src_nritems / 2 + 1;
1853 /* don't try to empty the node */
1854 if (max_push >= src_nritems)
1855 return 1;
1856
1857 if (max_push < push_items)
1858 push_items = max_push;
1859
1860 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1861 btrfs_node_key_ptr_offset(0),
1862 (dst_nritems) *
1863 sizeof(struct btrfs_key_ptr));
1864
1865 copy_extent_buffer(dst, src,
1866 btrfs_node_key_ptr_offset(0),
1867 btrfs_node_key_ptr_offset(src_nritems - push_items),
1868 push_items * sizeof(struct btrfs_key_ptr));
1869
1870 btrfs_set_header_nritems(src, src_nritems - push_items);
1871 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1872
1873 btrfs_mark_buffer_dirty(src);
1874 btrfs_mark_buffer_dirty(dst);
1875
1876 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1877 BUG_ON(ret);
1878
1879 return ret;
1880}
1881
1882/*
1883 * helper function to insert a new root level in the tree.
1884 * A new node is allocated, and a single item is inserted to
1885 * point to the existing root
1886 *
1887 * returns zero on success or < 0 on failure.
1888 */
1889static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1890 struct btrfs_root *root,
1891 struct btrfs_path *path, int level)
1892{
1893 u64 lower_gen;
1894 struct extent_buffer *lower;
1895 struct extent_buffer *c;
1896 struct extent_buffer *old;
1897 struct btrfs_disk_key lower_key;
1898 int ret;
1899
1900 BUG_ON(path->nodes[level]);
1901 BUG_ON(path->nodes[level-1] != root->node);
1902
1903 lower = path->nodes[level-1];
1904 if (level == 1)
1905 btrfs_item_key(lower, &lower_key, 0);
1906 else
1907 btrfs_node_key(lower, &lower_key, 0);
1908
1909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1910 root->root_key.objectid, trans->transid,
1911 level, root->node->start, 0);
1912 if (IS_ERR(c))
1913 return PTR_ERR(c);
1914
1915 memset_extent_buffer(c, 0, 0, root->nodesize);
1916 btrfs_set_header_nritems(c, 1);
1917 btrfs_set_header_level(c, level);
1918 btrfs_set_header_bytenr(c, c->start);
1919 btrfs_set_header_generation(c, trans->transid);
1920 btrfs_set_header_owner(c, root->root_key.objectid);
1921
1922 write_extent_buffer(c, root->fs_info->fsid,
1923 (unsigned long)btrfs_header_fsid(c),
1924 BTRFS_FSID_SIZE);
1925
1926 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1927 (unsigned long)btrfs_header_chunk_tree_uuid(c),
1928 BTRFS_UUID_SIZE);
1929
1930 btrfs_set_node_key(c, &lower_key, 0);
1931 btrfs_set_node_blockptr(c, 0, lower->start);
1932 lower_gen = btrfs_header_generation(lower);
1933 WARN_ON(lower_gen != trans->transid);
1934
1935 btrfs_set_node_ptr_generation(c, 0, lower_gen);
1936
1937 btrfs_mark_buffer_dirty(c);
1938
1939 spin_lock(&root->node_lock);
1940 old = root->node;
1941 root->node = c;
1942 spin_unlock(&root->node_lock);
1943
1944 ret = btrfs_update_extent_ref(trans, root, lower->start,
1945 lower->start, c->start,
1946 root->root_key.objectid,
1947 trans->transid, level - 1);
1948 BUG_ON(ret);
1949
1950 /* the super has an extra ref to root->node */
1951 free_extent_buffer(old);
1952
1953 add_root_to_dirty_list(root);
1954 extent_buffer_get(c);
1955 path->nodes[level] = c;
1956 path->locks[level] = 1;
1957 path->slots[level] = 0;
1958 return 0;
1959}
1960
1961/*
1962 * worker function to insert a single pointer in a node.
1963 * the node should have enough room for the pointer already
1964 *
1965 * slot and level indicate where you want the key to go, and
1966 * blocknr is the block the key points to.
1967 *
1968 * returns zero on success and < 0 on any error
1969 */
1970static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
1971 *root, struct btrfs_path *path, struct btrfs_disk_key
1972 *key, u64 bytenr, int slot, int level)
1973{
1974 struct extent_buffer *lower;
1975 int nritems;
1976
1977 BUG_ON(!path->nodes[level]);
1978 lower = path->nodes[level];
1979 nritems = btrfs_header_nritems(lower);
1980 if (slot > nritems)
1981 BUG();
1982 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
1983 BUG();
1984 if (slot != nritems) {
1985 memmove_extent_buffer(lower,
1986 btrfs_node_key_ptr_offset(slot + 1),
1987 btrfs_node_key_ptr_offset(slot),
1988 (nritems - slot) * sizeof(struct btrfs_key_ptr));
1989 }
1990 btrfs_set_node_key(lower, key, slot);
1991 btrfs_set_node_blockptr(lower, slot, bytenr);
1992 WARN_ON(trans->transid == 0);
1993 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
1994 btrfs_set_header_nritems(lower, nritems + 1);
1995 btrfs_mark_buffer_dirty(lower);
1996 return 0;
1997}
1998
1999/*
2000 * split the node at the specified level in path in two.
2001 * The path is corrected to point to the appropriate node after the split
2002 *
2003 * Before splitting this tries to make some room in the node by pushing
2004 * left and right, if either one works, it returns right away.
2005 *
2006 * returns 0 on success and < 0 on failure
2007 */
2008static noinline int split_node(struct btrfs_trans_handle *trans,
2009 struct btrfs_root *root,
2010 struct btrfs_path *path, int level)
2011{
2012 struct extent_buffer *c;
2013 struct extent_buffer *split;
2014 struct btrfs_disk_key disk_key;
2015 int mid;
2016 int ret;
2017 int wret;
2018 u32 c_nritems;
2019
2020 c = path->nodes[level];
2021 WARN_ON(btrfs_header_generation(c) != trans->transid);
2022 if (c == root->node) {
2023 /* trying to split the root, lets make a new one */
2024 ret = insert_new_root(trans, root, path, level + 1);
2025 if (ret)
2026 return ret;
2027 } else {
2028 ret = push_nodes_for_insert(trans, root, path, level);
2029 c = path->nodes[level];
2030 if (!ret && btrfs_header_nritems(c) <
2031 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2032 return 0;
2033 if (ret < 0)
2034 return ret;
2035 }
2036
2037 c_nritems = btrfs_header_nritems(c);
2038
2039 split = btrfs_alloc_free_block(trans, root, root->nodesize,
2040 path->nodes[level + 1]->start,
2041 root->root_key.objectid,
2042 trans->transid, level, c->start, 0);
2043 if (IS_ERR(split))
2044 return PTR_ERR(split);
2045
2046 btrfs_set_header_flags(split, btrfs_header_flags(c));
2047 btrfs_set_header_level(split, btrfs_header_level(c));
2048 btrfs_set_header_bytenr(split, split->start);
2049 btrfs_set_header_generation(split, trans->transid);
2050 btrfs_set_header_owner(split, root->root_key.objectid);
2051 btrfs_set_header_flags(split, 0);
2052 write_extent_buffer(split, root->fs_info->fsid,
2053 (unsigned long)btrfs_header_fsid(split),
2054 BTRFS_FSID_SIZE);
2055 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2056 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2057 BTRFS_UUID_SIZE);
2058
2059 mid = (c_nritems + 1) / 2;
2060
2061 copy_extent_buffer(split, c,
2062 btrfs_node_key_ptr_offset(0),
2063 btrfs_node_key_ptr_offset(mid),
2064 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2065 btrfs_set_header_nritems(split, c_nritems - mid);
2066 btrfs_set_header_nritems(c, mid);
2067 ret = 0;
2068
2069 btrfs_mark_buffer_dirty(c);
2070 btrfs_mark_buffer_dirty(split);
2071
2072 btrfs_node_key(split, &disk_key, 0);
2073 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2074 path->slots[level + 1] + 1,
2075 level + 1);
2076 if (wret)
2077 ret = wret;
2078
2079 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2080 BUG_ON(ret);
2081
2082 if (path->slots[level] >= mid) {
2083 path->slots[level] -= mid;
2084 btrfs_tree_unlock(c);
2085 free_extent_buffer(c);
2086 path->nodes[level] = split;
2087 path->slots[level + 1] += 1;
2088 } else {
2089 btrfs_tree_unlock(split);
2090 free_extent_buffer(split);
2091 }
2092 return ret;
2093}
2094
2095/*
2096 * how many bytes are required to store the items in a leaf. start
2097 * and nr indicate which items in the leaf to check. This totals up the
2098 * space used both by the item structs and the item data
2099 */
2100static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2101{
2102 int data_len;
2103 int nritems = btrfs_header_nritems(l);
2104 int end = min(nritems, start + nr) - 1;
2105
2106 if (!nr)
2107 return 0;
2108 data_len = btrfs_item_end_nr(l, start);
2109 data_len = data_len - btrfs_item_offset_nr(l, end);
2110 data_len += sizeof(struct btrfs_item) * nr;
2111 WARN_ON(data_len < 0);
2112 return data_len;
2113}
2114
2115/*
2116 * The space between the end of the leaf items and
2117 * the start of the leaf data. IOW, how much room
2118 * the leaf has left for both items and data
2119 */
2120noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2121 struct extent_buffer *leaf)
2122{
2123 int nritems = btrfs_header_nritems(leaf);
2124 int ret;
2125 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2126 if (ret < 0) {
2127 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2128 "used %d nritems %d\n",
2129 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2130 leaf_space_used(leaf, 0, nritems), nritems);
2131 }
2132 return ret;
2133}
2134
2135/*
2136 * push some data in the path leaf to the right, trying to free up at
2137 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2138 *
2139 * returns 1 if the push failed because the other node didn't have enough
2140 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * The right-hand neighbour is read through the parent in path->nodes[1]
 * at slot path->slots[1] + 1, locked, and copied on write before any
 * item is moved.  1 is also returned when there is no parent, when this
 * leaf is already the last child of its parent, when the COW of the
 * neighbour fails, when the left leaf is empty, or when no item could be
 * selected for moving.
 *
 * When 'empty' is zero, item 0 of the left leaf is never considered for
 * pushing; when it is nonzero every item may be moved.
 *
 * On success, if the path's slot ends up in the right leaf, the left
 * leaf is unlocked and released and the path is switched to the right
 * leaf; otherwise the right leaf is unlocked and released.
2141 */
2142static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2143 *root, struct btrfs_path *path, int data_size,
2144 int empty)
2145{
2146 struct extent_buffer *left = path->nodes[0];
2147 struct extent_buffer *right;
2148 struct extent_buffer *upper;
2149 struct btrfs_disk_key disk_key;
2150 int slot;
2151 u32 i;
2152 int free_space;
2153 int push_space = 0;
2154 int push_items = 0;
2155 struct btrfs_item *item;
2156 u32 left_nritems;
2157 u32 nr;
2158 u32 right_nritems;
2159 u32 data_end;
2160 u32 this_item_size;
2161 int ret;
2162
2163 slot = path->slots[1];
2164 if (!path->nodes[1])
2165 return 1;
2166
2167 upper = path->nodes[1];
 /* nothing to the right of the last pointer in the parent */
2168 if (slot >= btrfs_header_nritems(upper) - 1)
2169 return 1;
2170
2171 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2172
2173 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right);
2175 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size)
2177 goto out_unlock;
2178
2179 /* cow and double check */
2180 ret = btrfs_cow_block(trans, root, right, upper,
2181 slot + 1, &right, 0);
 /* a failed COW is reported to the caller as "could not push" (1) */
2182 if (ret)
2183 goto out_unlock;
2184
2185 free_space = btrfs_leaf_free_space(root, right);
2186 if (free_space < data_size)
2187 goto out_unlock;
2188
2189 left_nritems = btrfs_header_nritems(left);
2190 if (left_nritems == 0)
2191 goto out_unlock;
2192
 /* nr is the lowest item index the scan below may select */
2193 if (empty)
2194 nr = 0;
2195 else
2196 nr = 1;
2197
 /* the path points past the last item: reserve data_size up front */
2198 if (path->slots[0] >= left_nritems)
2199 push_space += data_size;
2200
 /*
  * Walk the left leaf from its last item downwards, counting in
  * push_items how many trailing items fit into the right leaf's free
  * space and in push_space how many bytes they (plus any reserved
  * data_size) need.
  */
2201 i = left_nritems - 1;
2202 while (i >= nr) {
2203 item = btrfs_item_nr(left, i);
2204
 /*
  * Unless emptying the leaf, once something has been selected stop
  * when i has dropped below the path's slot; at the slot itself stop
  * if the left leaf's free space plus twice the bytes selected so far
  * exceeds the free space of the right leaf.
  */
2205 if (!empty && push_items > 0) {
2206 if (path->slots[0] > i)
2207 break;
2208 if (path->slots[0] == i) {
2209 int space = btrfs_leaf_free_space(root, left);
2210 if (space + push_space * 2 > free_space)
2211 break;
2212 }
2213 }
2214
 /* moving the path's slot also moves the data_size reservation */
2215 if (path->slots[0] == i)
2216 push_space += data_size;
2217
 /*
  * Set up a mapping at this item header if none is active.
  * NOTE(review): presumably this lets the item accessors below use
  * the cached mapping - confirm against map_extent_buffer().
  */
2218 if (!left->map_token) {
2219 map_extent_buffer(left, (unsigned long)item,
2220 sizeof(struct btrfs_item),
2221 &left->map_token, &left->kaddr,
2222 &left->map_start, &left->map_len,
2223 KM_USER1);
2224 }
2225
2226 this_item_size = btrfs_item_size(left, item);
2227 if (this_item_size + sizeof(*item) + push_space > free_space)
2228 break;
2229
2230 push_items++;
2231 push_space += this_item_size + sizeof(*item);
 /* i is unsigned, so leave explicitly instead of wrapping below 0 */
2232 if (i == 0)
2233 break;
2234 i--;
2235 }
2236 if (left->map_token) {
2237 unmap_extent_buffer(left, left->map_token, KM_USER1);
2238 left->map_token = NULL;
2239 }
2240
2241 if (push_items == 0)
2242 goto out_unlock;
2243
2244 if (!empty && push_items == left_nritems)
2245 WARN_ON(1);
2246
2247 /* push left to right */
2248 right_nritems = btrfs_header_nritems(right);
2249
 /* from here on push_space is the number of data bytes to move */
2250 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2251 push_space -= leaf_data_end(root, left);
2252
2253 /* make room in the right data area */
2254 data_end = leaf_data_end(root, right);
2255 memmove_extent_buffer(right,
2256 btrfs_leaf_data(right) + data_end - push_space,
2257 btrfs_leaf_data(right) + data_end,
2258 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2259
2260 /* copy from the left data area */
2261 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2262 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2263 btrfs_leaf_data(left) + leaf_data_end(root, left),
2264 push_space);
2265
 /* shift the right leaf's item headers up by push_items slots */
2266 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2267 btrfs_item_nr_offset(0),
2268 right_nritems * sizeof(struct btrfs_item));
2269
2270 /* copy the items from left to right */
2271 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2272 btrfs_item_nr_offset(left_nritems - push_items),
2273 push_items * sizeof(struct btrfs_item));
2274
2275 /* update the item pointers */
2276 right_nritems += push_items;
2277 btrfs_set_header_nritems(right, right_nritems);
 /*
  * Recompute every offset in the right leaf: item 0 ends at the end
  * of the data area and each following item sits directly below the
  * previous one.
  */
2278 push_space = BTRFS_LEAF_DATA_SIZE(root);
2279 for (i = 0; i < right_nritems; i++) {
2280 item = btrfs_item_nr(right, i);
2281 if (!right->map_token) {
2282 map_extent_buffer(right, (unsigned long)item,
2283 sizeof(struct btrfs_item),
2284 &right->map_token, &right->kaddr,
2285 &right->map_start, &right->map_len,
2286 KM_USER1);
2287 }
2288 push_space -= btrfs_item_size(right, item);
2289 btrfs_set_item_offset(right, item, push_space);
2290 }
2291
2292 if (right->map_token) {
2293 unmap_extent_buffer(right, right->map_token, KM_USER1);
2294 right->map_token = NULL;
2295 }
2296 left_nritems -= push_items;
2297 btrfs_set_header_nritems(left, left_nritems);
2298
 /* a left leaf that is now empty is not marked dirty */
2299 if (left_nritems)
2300 btrfs_mark_buffer_dirty(left);
2301 btrfs_mark_buffer_dirty(right);
2302
2303 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2304 BUG_ON(ret);
2305
 /* the right leaf has a new first key; update its pointer in the parent */
2306 btrfs_item_key(right, &disk_key, 0);
2307 btrfs_set_node_key(upper, &disk_key, slot + 1);
2308 btrfs_mark_buffer_dirty(upper);
2309
2310 /* then fixup the leaf pointer in the path */
2311 if (path->slots[0] >= left_nritems) {
2312 path->slots[0] -= left_nritems;
2313 if (btrfs_header_nritems(path->nodes[0]) == 0)
2314 clean_tree_block(trans, root, path->nodes[0]);
2315 btrfs_tree_unlock(path->nodes[0]);
2316 free_extent_buffer(path->nodes[0]);
2317 path->nodes[0] = right;
2318 path->slots[1] += 1;
2319 } else {
2320 btrfs_tree_unlock(right);
2321 free_extent_buffer(right);
2322 }
2323 return 0;
2324
 /* common exit for every "nothing was pushed" case */
2325out_unlock:
2326 btrfs_tree_unlock(right);
2327 free_extent_buffer(right);
2328 return 1;
2329}
2330
2331/*
2332 * push some data in the path leaf to the left, trying to free up at
2333 * least data_size bytes. returns zero if the push worked, nonzero otherwise
 *
 * The left-hand neighbour is read through the parent in path->nodes[1]
 * at slot path->slots[1] - 1, locked, and copied on write before any
 * item is moved.
 *
 * returns 1 when nothing was pushed: the leaf is the first child of its
 * parent, there is no parent, the leaf is empty, the neighbour has less
 * than data_size bytes free, the COW of the neighbour fails, or no item
 * could be selected.  Otherwise the result of fixup_low_keys() is
 * returned.
 *
 * When 'empty' is zero, the last item of the right leaf is never
 * considered for pushing; when it is nonzero every item may be moved.
 *
 * On success the path is adjusted to keep pointing at the same item,
 * switching to the left leaf if that item was moved.
2334 */
2335static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2336 *root, struct btrfs_path *path, int data_size,
2337 int empty)
2338{
2339 struct btrfs_disk_key disk_key;
2340 struct extent_buffer *right = path->nodes[0];
2341 struct extent_buffer *left;
2342 int slot;
2343 int i;
2344 int free_space;
2345 int push_space = 0;
2346 int push_items = 0;
2347 struct btrfs_item *item;
2348 u32 old_left_nritems;
2349 u32 right_nritems;
2350 u32 nr;
2351 int ret = 0;
2352 int wret;
2353 u32 this_item_size;
2354 u32 old_left_item_size;
2355
2356 slot = path->slots[1];
 /* slot 0 in the parent has nothing to its left */
2357 if (slot == 0)
2358 return 1;
2359 if (!path->nodes[1])
2360 return 1;
2361
2362 right_nritems = btrfs_header_nritems(right);
2363 if (right_nritems == 0)
2364 return 1;
2365
2366 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2367
2368 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left);
2370 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) {
2372 ret = 1;
2373 goto out;
2374 }
2375
2376 /* cow and double check */
2377 ret = btrfs_cow_block(trans, root, left,
2378 path->nodes[1], slot - 1, &left, 0);
2379 if (ret) {
2380 /* we hit -ENOSPC, but it isn't fatal here */
2381 ret = 1;
2382 goto out;
2383 }
2384
2385 free_space = btrfs_leaf_free_space(root, left);
2386 if (free_space < data_size) {
2387 ret = 1;
2388 goto out;
2389 }
2390
 /* nr is one past the highest item index the scan below may select */
2391 if (empty)
2392 nr = right_nritems;
2393 else
2394 nr = right_nritems - 1;
2395
 /*
  * Walk the right leaf from its first item upwards, counting in
  * push_items how many leading items fit into the left leaf's free
  * space and in push_space how many bytes they (plus any reserved
  * data_size) need.
  */
2396 for (i = 0; i < nr; i++) {
2397 item = btrfs_item_nr(right, i);
2398 if (!right->map_token) {
2399 map_extent_buffer(right, (unsigned long)item,
2400 sizeof(struct btrfs_item),
2401 &right->map_token, &right->kaddr,
2402 &right->map_start, &right->map_len,
2403 KM_USER1);
2404 }
2405
 /*
  * Unless emptying the leaf, once something has been selected stop
  * when i has moved past the path's slot; at the slot itself stop if
  * the right leaf's free space plus twice the bytes selected so far
  * exceeds the free space of the left leaf.
  */
2406 if (!empty && push_items > 0) {
2407 if (path->slots[0] < i)
2408 break;
2409 if (path->slots[0] == i) {
2410 int space = btrfs_leaf_free_space(root, right);
2411 if (space + push_space * 2 > free_space)
2412 break;
2413 }
2414 }
2415
 /* moving the path's slot also moves the data_size reservation */
2416 if (path->slots[0] == i)
2417 push_space += data_size;
2418
2419 this_item_size = btrfs_item_size(right, item);
2420 if (this_item_size + sizeof(*item) + push_space > free_space)
2421 break;
2422
2423 push_items++;
2424 push_space += this_item_size + sizeof(*item);
2425 }
2426
2427 if (right->map_token) {
2428 unmap_extent_buffer(right, right->map_token, KM_USER1);
2429 right->map_token = NULL;
2430 }
2431
2432 if (push_items == 0) {
2433 ret = 1;
2434 goto out;
2435 }
2436 if (!empty && push_items == btrfs_header_nritems(right))
2437 WARN_ON(1);
2438
2439 /* push data from right to left */
 /* item headers first, appended after the left leaf's existing items */
2440 copy_extent_buffer(left, right,
2441 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2442 btrfs_item_nr_offset(0),
2443 push_items * sizeof(struct btrfs_item));
2444
 /* from here on push_space is the number of data bytes to move */
2445 push_space = BTRFS_LEAF_DATA_SIZE(root) -
2446 btrfs_item_offset_nr(right, push_items - 1);
2447
 /* then the item data, placed directly below the left leaf's data */
2448 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2449 leaf_data_end(root, left) - push_space,
2450 btrfs_leaf_data(right) +
2451 btrfs_item_offset_nr(right, push_items - 1),
2452 push_space);
2453 old_left_nritems = btrfs_header_nritems(left);
2454 BUG_ON(old_left_nritems <= 0);
2455
 /*
  * Despite its name this holds the data offset of the left leaf's
  * last pre-existing item.  The copied headers still carry offsets
  * relative to the right leaf, so each one is lowered by the distance
  * between the end of the data area and that offset.
  */
2456 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2457 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2458 u32 ioff;
2459
2460 item = btrfs_item_nr(left, i);
2461 if (!left->map_token) {
2462 map_extent_buffer(left, (unsigned long)item,
2463 sizeof(struct btrfs_item),
2464 &left->map_token, &left->kaddr,
2465 &left->map_start, &left->map_len,
2466 KM_USER1);
2467 }
2468
2469 ioff = btrfs_item_offset(left, item);
2470 btrfs_set_item_offset(left, item,
2471 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2472 }
2473 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2474 if (left->map_token) {
2475 unmap_extent_buffer(left, left->map_token, KM_USER1);
2476 left->map_token = NULL;
2477 }
2478
2479 /* fixup right node */
2480 if (push_items > right_nritems) {
2481 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2482 right_nritems);
2483 WARN_ON(1);
2484 }
2485
 /*
  * If items remain in the right leaf, slide their data up to the end
  * of the data area and their headers down to slot 0.
  */
2486 if (push_items < right_nritems) {
2487 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2488 leaf_data_end(root, right);
2489 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2490 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2491 btrfs_leaf_data(right) +
2492 leaf_data_end(root, right), push_space);
2493
2494 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2495 btrfs_item_nr_offset(push_items),
2496 (btrfs_header_nritems(right) - push_items) *
2497 sizeof(struct btrfs_item));
2498 }
2499 right_nritems -= push_items;
2500 btrfs_set_header_nritems(right, right_nritems);
 /*
  * Recompute every remaining offset in the right leaf: item 0 ends at
  * the end of the data area and each following item sits directly
  * below the previous one.
  */
2501 push_space = BTRFS_LEAF_DATA_SIZE(root);
2502 for (i = 0; i < right_nritems; i++) {
2503 item = btrfs_item_nr(right, i);
2504
2505 if (!right->map_token) {
2506 map_extent_buffer(right, (unsigned long)item,
2507 sizeof(struct btrfs_item),
2508 &right->map_token, &right->kaddr,
2509 &right->map_start, &right->map_len,
2510 KM_USER1);
2511 }
2512
2513 push_space = push_space - btrfs_item_size(right, item);
2514 btrfs_set_item_offset(right, item, push_space);
2515 }
2516 if (right->map_token) {
2517 unmap_extent_buffer(right, right->map_token, KM_USER1);
2518 right->map_token = NULL;
2519 }
2520
2521 btrfs_mark_buffer_dirty(left);
 /* a right leaf that is now empty is not marked dirty */
2522 if (right_nritems)
2523 btrfs_mark_buffer_dirty(right);
2524
2525 ret = btrfs_update_ref(trans, root, right, left,
2526 old_left_nritems, push_items);
2527 BUG_ON(ret);
2528
 /* the right leaf has a new first key; propagate it to the parents */
2529 btrfs_item_key(right, &disk_key, 0);
2530 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2531 if (wret)
2532 ret = wret;
2533
2534 /* then fixup the leaf pointer in the path */
2535 if (path->slots[0] < push_items) {
 /* the path's item moved: follow it into the left leaf */
2536 path->slots[0] += old_left_nritems;
2537 if (btrfs_header_nritems(path->nodes[0]) == 0)
2538 clean_tree_block(trans, root, path->nodes[0]);
2539 btrfs_tree_unlock(path->nodes[0]);
2540 free_extent_buffer(path->nodes[0]);
2541 path->nodes[0] = left;
2542 path->slots[1] -= 1;
2543 } else {
 /* the path's item stayed, but now sits push_items slots lower */
2544 btrfs_tree_unlock(left);
2545 free_extent_buffer(left);
2546 path->slots[0] -= push_items;
2547 }
2548 BUG_ON(path->slots[0] < 0);
2549 return ret;
 /* common exit for every failure after the left leaf was locked */
2550out:
2551 btrfs_tree_unlock(left);
2552 free_extent_buffer(left);
2553 return ret;
2554}
2555
2556/*
2557 * split the path's leaf in two, making sure there is at least data_size
2558 * available for the resulting leaf level of the path.
2559 *
2560 * returns 0 if all went well and < 0 on failure.
2561 */
2562static noinline int split_leaf(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root,
2564 struct btrfs_key *ins_key,
2565 struct btrfs_path *path, int data_size,
2566 int extend)
2567{
2568 struct extent_buffer *l;
2569 u32 nritems;
2570 int mid;
2571 int slot;
2572 struct extent_buffer *right;
2573 int data_copy_size;
2574 int rt_data_off;
2575 int i;
2576 int ret = 0;
2577 int wret;
2578 int double_split;
2579 int num_doubles = 0;
2580 struct btrfs_disk_key disk_key;
2581
2582 /* first try to make some room by pushing left and right */
2583 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2584 wret = push_leaf_right(trans, root, path, data_size, 0);
2585 if (wret < 0)
2586 return wret;
2587 if (wret) {
2588 wret = push_leaf_left(trans, root, path, data_size, 0);
2589 if (wret < 0)
2590 return wret;
2591 }
2592 l = path->nodes[0];
2593
2594 /* did the pushes work? */
2595 if (btrfs_leaf_free_space(root, l) >= data_size)
2596 return 0;
2597 }
2598
2599 if (!path->nodes[1]) {
2600 ret = insert_new_root(trans, root, path, 1);
2601 if (ret)
2602 return ret;
2603 }
2604again:
2605 double_split = 0;
2606 l = path->nodes[0];
2607 slot = path->slots[0];
2608 nritems = btrfs_header_nritems(l);
2609 mid = (nritems + 1) / 2;
2610
2611 right = btrfs_alloc_free_block(trans, root, root->leafsize,
2612 path->nodes[1]->start,
2613 root->root_key.objectid,
2614 trans->transid, 0, l->start, 0);
2615 if (IS_ERR(right)) {
2616 BUG_ON(1);
2617 return PTR_ERR(right);
2618 }
2619
2620 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2621 btrfs_set_header_bytenr(right, right->start);
2622 btrfs_set_header_generation(right, trans->transid);
2623 btrfs_set_header_owner(right, root->root_key.objectid);
2624 btrfs_set_header_level(right, 0);
2625 write_extent_buffer(right, root->fs_info->fsid,
2626 (unsigned long)btrfs_header_fsid(right),
2627 BTRFS_FSID_SIZE);
2628
2629 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2630 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2631 BTRFS_UUID_SIZE);
2632 if (mid <= slot) {
2633 if (nritems == 1 ||
2634 leaf_space_used(l, mid, nritems - mid) + data_size >
2635 BTRFS_LEAF_DATA_SIZE(root)) {
2636 if (slot >= nritems) {
2637 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2638 btrfs_set_header_nritems(right, 0);
2639 wret = insert_ptr(trans, root, path,
2640 &disk_key, right->start,
2641 path->slots[1] + 1, 1);
2642 if (wret)
2643 ret = wret;
2644
2645 btrfs_tree_unlock(path->nodes[0]);
2646 free_extent_buffer(path->nodes[0]);
2647 path->nodes[0] = right;
2648 path->slots[0] = 0;
2649 path->slots[1] += 1;
2650 btrfs_mark_buffer_dirty(right);
2651 return ret;
2652 }
2653 mid = slot;
2654 if (mid != nritems &&
2655 leaf_space_used(l, mid, nritems - mid) +
2656 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2657 double_split = 1;
2658 }
2659 }
2660 } else {
2661 if (leaf_space_used(l, 0, mid) + data_size >
2662 BTRFS_LEAF_DATA_SIZE(root)) {
2663 if (!extend && data_size && slot == 0) {
2664 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2665 btrfs_set_header_nritems(right, 0);
2666 wret = insert_ptr(trans, root, path,
2667 &disk_key,
2668 right->start,
2669 path->slots[1], 1);
2670 if (wret)
2671 ret = wret;
2672 btrfs_tree_unlock(path->nodes[0]);
2673 free_extent_buffer(path->nodes[0]);
2674 path->nodes[0] = right;
2675 path->slots[0] = 0;
2676 if (path->slots[1] == 0) {
2677 wret = fixup_low_keys(trans, root,
2678 path, &disk_key, 1);
2679 if (wret)
2680 ret = wret;
2681 }
2682 btrfs_mark_buffer_dirty(right);
2683 return ret;
2684 } else if ((extend || !data_size) && slot == 0) {
2685 mid = 1;
2686 } else {
2687 mid = slot;
2688 if (mid != nritems &&
2689 leaf_space_used(l, mid, nritems - mid) +
2690 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2691 double_split = 1;
2692 }
2693 }
2694 }
2695 }
2696 nritems = nritems - mid;
2697 btrfs_set_header_nritems(right, nritems);
2698 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2699
2700 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2701 btrfs_item_nr_offset(mid),
2702 nritems * sizeof(struct btrfs_item));
2703
2704 copy_extent_buffer(right, l,
2705 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2706 data_copy_size, btrfs_leaf_data(l) +
2707 leaf_data_end(root, l), data_copy_size);
2708
2709 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2710 btrfs_item_end_nr(l, mid);
2711
2712 for (i = 0; i < nritems; i++) {
2713 struct btrfs_item *item = btrfs_item_nr(right, i);
2714 u32 ioff;
2715
2716 if (!right->map_token) {
2717 map_extent_buffer(right, (unsigned long)item,
2718 sizeof(struct btrfs_item),
2719 &right->map_token, &right->kaddr,
2720 &right->map_start, &right->map_len,
2721 KM_USER1);
2722 }
2723
2724 ioff = btrfs_item_offset(right, item);
2725 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2726 }
2727
2728 if (right->map_token) {
2729 unmap_extent_buffer(right, right->map_token, KM_USER1);
2730 right->map_token = NULL;
2731 }
2732
2733 btrfs_set_header_nritems(l, mid);
2734 ret = 0;
2735 btrfs_item_key(right, &disk_key, 0);
2736 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2737 path->slots[1] + 1, 1);
2738 if (wret)
2739 ret = wret;
2740
2741 btrfs_mark_buffer_dirty(right);
2742 btrfs_mark_buffer_dirty(l);
2743 BUG_ON(path->slots[0] != slot);
2744
2745 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2746 BUG_ON(ret);
2747
2748 if (mid <= slot) {
2749 btrfs_tree_unlock(path->nodes[0]);
2750 free_extent_buffer(path->nodes[0]);
2751 path->nodes[0] = right;
2752 path->slots[0] -= mid;
2753 path->slots[1] += 1;
2754 } else {
2755 btrfs_tree_unlock(right);
2756 free_extent_buffer(right);
2757 }
2758
2759 BUG_ON(path->slots[0] < 0);
2760
2761 if (double_split) {
2762 BUG_ON(num_doubles != 0);
2763 num_doubles++;
2764 goto again;
2765 }
2766 return ret;
2767}
2768
2769/*
2770 * This function splits a single item into two items,
2771 * giving 'new_key' to the new item and splitting the
2772 * old one at split_offset (from the start of the item).
2773 *
2774 * The path may be released by this operation. After
2775 * the split, the path is pointing to the old item. The
2776 * new item is going to be in the same node as the old one.
2777 *
2778 * Note, the item being split must be smaller enough to live alone on
2779 * a tree block with room for one extra struct btrfs_item
2780 *
2781 * This allows us to split the item in place, keeping a lock on the
2782 * leaf the entire time.
2783 */
2784int btrfs_split_item(struct btrfs_trans_handle *trans,
2785 struct btrfs_root *root,
2786 struct btrfs_path *path,
2787 struct btrfs_key *new_key,
2788 unsigned long split_offset)
2789{
2790 u32 item_size;
2791 struct extent_buffer *leaf;
2792 struct btrfs_key orig_key;
2793 struct btrfs_item *item;
2794 struct btrfs_item *new_item;
2795 int ret = 0;
2796 int slot;
2797 u32 nritems;
2798 u32 orig_offset;
2799 struct btrfs_disk_key disk_key;
2800 char *buf;
2801
2802 leaf = path->nodes[0];
2803 btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
2804 if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
2805 goto split;
2806
2807 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2808 btrfs_release_path(root, path);
2809
2810 path->search_for_split = 1;
2811 path->keep_locks = 1;
2812
2813 ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
2814 path->search_for_split = 0;
2815
2816 /* if our item isn't there or got smaller, return now */
2817 if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
2818 path->slots[0])) {
2819 path->keep_locks = 0;
2820 return -EAGAIN;
2821 }
2822
2823 ret = split_leaf(trans, root, &orig_key, path,
2824 sizeof(struct btrfs_item), 1);
2825 path->keep_locks = 0;
2826 BUG_ON(ret);
2827
2828 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830
2831split:
2832 item = btrfs_item_nr(leaf, path->slots[0]);
2833 orig_offset = btrfs_item_offset(leaf, item);
2834 item_size = btrfs_item_size(leaf, item);
2835
2836
2837 buf = kmalloc(item_size, GFP_NOFS);
2838 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
2839 path->slots[0]), item_size);
2840 slot = path->slots[0] + 1;
2841 leaf = path->nodes[0];
2842
2843 nritems = btrfs_header_nritems(leaf);
2844
2845 if (slot != nritems) {
2846 /* shift the items */
2847 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
2848 btrfs_item_nr_offset(slot),
2849 (nritems - slot) * sizeof(struct btrfs_item));
2850
2851 }
2852
2853 btrfs_cpu_key_to_disk(&disk_key, new_key);
2854 btrfs_set_item_key(leaf, &disk_key, slot);
2855
2856 new_item = btrfs_item_nr(leaf, slot);
2857
2858 btrfs_set_item_offset(leaf, new_item, orig_offset);
2859 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
2860
2861 btrfs_set_item_offset(leaf, item,
2862 orig_offset + item_size - split_offset);
2863 btrfs_set_item_size(leaf, item, split_offset);
2864
2865 btrfs_set_header_nritems(leaf, nritems + 1);
2866
2867 /* write the data for the start of the original item */
2868 write_extent_buffer(leaf, buf,
2869 btrfs_item_ptr_offset(leaf, path->slots[0]),
2870 split_offset);
2871
2872 /* write the data for the new item */
2873 write_extent_buffer(leaf, buf + split_offset,
2874 btrfs_item_ptr_offset(leaf, slot),
2875 item_size - split_offset);
2876 btrfs_mark_buffer_dirty(leaf);
2877
2878 ret = 0;
2879 if (btrfs_leaf_free_space(root, leaf) < 0) {
2880 btrfs_print_leaf(root, leaf);
2881 BUG();
2882 }
2883 kfree(buf);
2884 return ret;
2885}
2886
2887/*
2888 * make the item pointed to by the path smaller. new_size indicates
2889 * how small to make it, and from_end tells us if we just chop bytes
2890 * off the end of the item or if we shift the item to chop bytes off
2891 * the front.
2892 */
2893int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2894 struct btrfs_root *root,
2895 struct btrfs_path *path,
2896 u32 new_size, int from_end)
2897{
2898 int ret = 0;
2899 int slot;
2900 int slot_orig;
2901 struct extent_buffer *leaf;
2902 struct btrfs_item *item;
2903 u32 nritems;
2904 unsigned int data_end;
2905 unsigned int old_data_start;
2906 unsigned int old_size;
2907 unsigned int size_diff;
2908 int i;
2909
2910 slot_orig = path->slots[0];
2911 leaf = path->nodes[0];
2912 slot = path->slots[0];
2913
2914 old_size = btrfs_item_size_nr(leaf, slot);
2915 if (old_size == new_size)
2916 return 0;
2917
2918 nritems = btrfs_header_nritems(leaf);
2919 data_end = leaf_data_end(root, leaf);
2920
2921 old_data_start = btrfs_item_offset_nr(leaf, slot);
2922
2923 size_diff = old_size - new_size;
2924
2925 BUG_ON(slot < 0);
2926 BUG_ON(slot >= nritems);
2927
2928 /*
2929 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2930 */
2931 /* first correct the data pointers */
2932 for (i = slot; i < nritems; i++) {
2933 u32 ioff;
2934 item = btrfs_item_nr(leaf, i);
2935
2936 if (!leaf->map_token) {
2937 map_extent_buffer(leaf, (unsigned long)item,
2938 sizeof(struct btrfs_item),
2939 &leaf->map_token, &leaf->kaddr,
2940 &leaf->map_start, &leaf->map_len,
2941 KM_USER1);
2942 }
2943
2944 ioff = btrfs_item_offset(leaf, item);
2945 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2946 }
2947
2948 if (leaf->map_token) {
2949 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2950 leaf->map_token = NULL;
2951 }
2952
2953 /* shift the data */
2954 if (from_end) {
2955 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2956 data_end + size_diff, btrfs_leaf_data(leaf) +
2957 data_end, old_data_start + new_size - data_end);
2958 } else {
2959 struct btrfs_disk_key disk_key;
2960 u64 offset;
2961
2962 btrfs_item_key(leaf, &disk_key, slot);
2963
2964 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2965 unsigned long ptr;
2966 struct btrfs_file_extent_item *fi;
2967
2968 fi = btrfs_item_ptr(leaf, slot,
2969 struct btrfs_file_extent_item);
2970 fi = (struct btrfs_file_extent_item *)(
2971 (unsigned long)fi - size_diff);
2972
2973 if (btrfs_file_extent_type(leaf, fi) ==
2974 BTRFS_FILE_EXTENT_INLINE) {
2975 ptr = btrfs_item_ptr_offset(leaf, slot);
2976 memmove_extent_buffer(leaf, ptr,
2977 (unsigned long)fi,
2978 offsetof(struct btrfs_file_extent_item,
2979 disk_bytenr));
2980 }
2981 }
2982
2983 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2984 data_end + size_diff, btrfs_leaf_data(leaf) +
2985 data_end, old_data_start - data_end);
2986
2987 offset = btrfs_disk_key_offset(&disk_key);
2988 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2989 btrfs_set_item_key(leaf, &disk_key, slot);
2990 if (slot == 0)
2991 fixup_low_keys(trans, root, path, &disk_key, 1);
2992 }
2993
2994 item = btrfs_item_nr(leaf, slot);
2995 btrfs_set_item_size(leaf, item, new_size);
2996 btrfs_mark_buffer_dirty(leaf);
2997
2998 ret = 0;
2999 if (btrfs_leaf_free_space(root, leaf) < 0) {
3000 btrfs_print_leaf(root, leaf);
3001 BUG();
3002 }
3003 return ret;
3004}
3005
3006/*
3007 * make the item pointed to by the path bigger, data_size is the new size.
3008 */
3009int btrfs_extend_item(struct btrfs_trans_handle *trans,
3010 struct btrfs_root *root, struct btrfs_path *path,
3011 u32 data_size)
3012{
3013 int ret = 0;
3014 int slot;
3015 int slot_orig;
3016 struct extent_buffer *leaf;
3017 struct btrfs_item *item;
3018 u32 nritems;
3019 unsigned int data_end;
3020 unsigned int old_data;
3021 unsigned int old_size;
3022 int i;
3023
3024 slot_orig = path->slots[0];
3025 leaf = path->nodes[0];
3026
3027 nritems = btrfs_header_nritems(leaf);
3028 data_end = leaf_data_end(root, leaf);
3029
3030 if (btrfs_leaf_free_space(root, leaf) < data_size) {
3031 btrfs_print_leaf(root, leaf);
3032 BUG();
3033 }
3034 slot = path->slots[0];
3035 old_data = btrfs_item_end_nr(leaf, slot);
3036
3037 BUG_ON(slot < 0);
3038 if (slot >= nritems) {
3039 btrfs_print_leaf(root, leaf);
3040 printk(KERN_CRIT "slot %d too large, nritems %d\n",
3041 slot, nritems);
3042 BUG_ON(1);
3043 }
3044
3045 /*
3046 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3047 */
3048 /* first correct the data pointers */
3049 for (i = slot; i < nritems; i++) {
3050 u32 ioff;
3051 item = btrfs_item_nr(leaf, i);
3052
3053 if (!leaf->map_token) {
3054 map_extent_buffer(leaf, (unsigned long)item,
3055 sizeof(struct btrfs_item),
3056 &leaf->map_token, &leaf->kaddr,
3057 &leaf->map_start, &leaf->map_len,
3058 KM_USER1);
3059 }
3060 ioff = btrfs_item_offset(leaf, item);
3061 btrfs_set_item_offset(leaf, item, ioff - data_size);
3062 }
3063
3064 if (leaf->map_token) {
3065 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3066 leaf->map_token = NULL;
3067 }
3068
3069 /* shift the data */
3070 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3071 data_end - data_size, btrfs_leaf_data(leaf) +
3072 data_end, old_data - data_end);
3073
3074 data_end = old_data;
3075 old_size = btrfs_item_size_nr(leaf, slot);
3076 item = btrfs_item_nr(leaf, slot);
3077 btrfs_set_item_size(leaf, item, old_size + data_size);
3078 btrfs_mark_buffer_dirty(leaf);
3079
3080 ret = 0;
3081 if (btrfs_leaf_free_space(root, leaf) < 0) {
3082 btrfs_print_leaf(root, leaf);
3083 BUG();
3084 }
3085 return ret;
3086}
3087
3088/*
3089 * Given a key and some data, insert items into the tree.
3090 * This does all the path init required, making room in the tree if needed.
3091 * Returns the number of keys that were inserted.
3092 */
3093int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3094 struct btrfs_root *root,
3095 struct btrfs_path *path,
3096 struct btrfs_key *cpu_key, u32 *data_size,
3097 int nr)
3098{
3099 struct extent_buffer *leaf;
3100 struct btrfs_item *item;
3101 int ret = 0;
3102 int slot;
3103 int i;
3104 u32 nritems;
3105 u32 total_data = 0;
3106 u32 total_size = 0;
3107 unsigned int data_end;
3108 struct btrfs_disk_key disk_key;
3109 struct btrfs_key found_key;
3110
3111 for (i = 0; i < nr; i++) {
3112 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3113 BTRFS_LEAF_DATA_SIZE(root)) {
3114 break;
3115 nr = i;
3116 }
3117 total_data += data_size[i];
3118 total_size += data_size[i] + sizeof(struct btrfs_item);
3119 }
3120 BUG_ON(nr == 0);
3121
3122 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3123 if (ret == 0)
3124 return -EEXIST;
3125 if (ret < 0)
3126 goto out;
3127
3128 leaf = path->nodes[0];
3129
3130 nritems = btrfs_header_nritems(leaf);
3131 data_end = leaf_data_end(root, leaf);
3132
3133 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3134 for (i = nr; i >= 0; i--) {
3135 total_data -= data_size[i];
3136 total_size -= data_size[i] + sizeof(struct btrfs_item);
3137 if (total_size < btrfs_leaf_free_space(root, leaf))
3138 break;
3139 }
3140 nr = i;
3141 }
3142
3143 slot = path->slots[0];
3144 BUG_ON(slot < 0);
3145
3146 if (slot != nritems) {
3147 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3148
3149 item = btrfs_item_nr(leaf, slot);
3150 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3151
3152 /* figure out how many keys we can insert in here */
3153 total_data = data_size[0];
3154 for (i = 1; i < nr; i++) {
3155 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3156 break;
3157 total_data += data_size[i];
3158 }
3159 nr = i;
3160
3161 if (old_data < data_end) {
3162 btrfs_print_leaf(root, leaf);
3163 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3164 slot, old_data, data_end);
3165 BUG_ON(1);
3166 }
3167 /*
3168 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3169 */
3170 /* first correct the data pointers */
3171 WARN_ON(leaf->map_token);
3172 for (i = slot; i < nritems; i++) {
3173 u32 ioff;
3174
3175 item = btrfs_item_nr(leaf, i);
3176 if (!leaf->map_token) {
3177 map_extent_buffer(leaf, (unsigned long)item,
3178 sizeof(struct btrfs_item),
3179 &leaf->map_token, &leaf->kaddr,
3180 &leaf->map_start, &leaf->map_len,
3181 KM_USER1);
3182 }
3183
3184 ioff = btrfs_item_offset(leaf, item);
3185 btrfs_set_item_offset(leaf, item, ioff - total_data);
3186 }
3187 if (leaf->map_token) {
3188 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3189 leaf->map_token = NULL;
3190 }
3191
3192 /* shift the items */
3193 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3194 btrfs_item_nr_offset(slot),
3195 (nritems - slot) * sizeof(struct btrfs_item));
3196
3197 /* shift the data */
3198 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3199 data_end - total_data, btrfs_leaf_data(leaf) +
3200 data_end, old_data - data_end);
3201 data_end = old_data;
3202 } else {
3203 /*
3204 * this sucks but it has to be done, if we are inserting at
3205 * the end of the leaf only insert 1 of the items, since we
3206 * have no way of knowing whats on the next leaf and we'd have
3207 * to drop our current locks to figure it out
3208 */
3209 nr = 1;
3210 }
3211
3212 /* setup the item for the new data */
3213 for (i = 0; i < nr; i++) {
3214 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3215 btrfs_set_item_key(leaf, &disk_key, slot + i);
3216 item = btrfs_item_nr(leaf, slot + i);
3217 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3218 data_end -= data_size[i];
3219 btrfs_set_item_size(leaf, item, data_size[i]);
3220 }
3221 btrfs_set_header_nritems(leaf, nritems + nr);
3222 btrfs_mark_buffer_dirty(leaf);
3223
3224 ret = 0;
3225 if (slot == 0) {
3226 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3227 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3228 }
3229
3230 if (btrfs_leaf_free_space(root, leaf) < 0) {
3231 btrfs_print_leaf(root, leaf);
3232 BUG();
3233 }
3234out:
3235 if (!ret)
3236 ret = nr;
3237 return ret;
3238}
3239
3240/*
3241 * Given a key and some data, insert items into the tree.
3242 * This does all the path init required, making room in the tree if needed.
3243 */
3244int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3245 struct btrfs_root *root,
3246 struct btrfs_path *path,
3247 struct btrfs_key *cpu_key, u32 *data_size,
3248 int nr)
3249{
3250 struct extent_buffer *leaf;
3251 struct btrfs_item *item;
3252 int ret = 0;
3253 int slot;
3254 int slot_orig;
3255 int i;
3256 u32 nritems;
3257 u32 total_size = 0;
3258 u32 total_data = 0;
3259 unsigned int data_end;
3260 struct btrfs_disk_key disk_key;
3261
3262 for (i = 0; i < nr; i++)
3263 total_data += data_size[i];
3264
3265 total_size = total_data + (nr * sizeof(struct btrfs_item));
3266 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3267 if (ret == 0)
3268 return -EEXIST;
3269 if (ret < 0)
3270 goto out;
3271
3272 slot_orig = path->slots[0];
3273 leaf = path->nodes[0];
3274
3275 nritems = btrfs_header_nritems(leaf);
3276 data_end = leaf_data_end(root, leaf);
3277
3278 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3279 btrfs_print_leaf(root, leaf);
3280 printk(KERN_CRIT "not enough freespace need %u have %d\n",
3281 total_size, btrfs_leaf_free_space(root, leaf));
3282 BUG();
3283 }
3284
3285 slot = path->slots[0];
3286 BUG_ON(slot < 0);
3287
3288 if (slot != nritems) {
3289 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3290
3291 if (old_data < data_end) {
3292 btrfs_print_leaf(root, leaf);
3293 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3294 slot, old_data, data_end);
3295 BUG_ON(1);
3296 }
3297 /*
3298 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3299 */
3300 /* first correct the data pointers */
3301 WARN_ON(leaf->map_token);
3302 for (i = slot; i < nritems; i++) {
3303 u32 ioff;
3304
3305 item = btrfs_item_nr(leaf, i);
3306 if (!leaf->map_token) {
3307 map_extent_buffer(leaf, (unsigned long)item,
3308 sizeof(struct btrfs_item),
3309 &leaf->map_token, &leaf->kaddr,
3310 &leaf->map_start, &leaf->map_len,
3311 KM_USER1);
3312 }
3313
3314 ioff = btrfs_item_offset(leaf, item);
3315 btrfs_set_item_offset(leaf, item, ioff - total_data);
3316 }
3317 if (leaf->map_token) {
3318 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3319 leaf->map_token = NULL;
3320 }
3321
3322 /* shift the items */
3323 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3324 btrfs_item_nr_offset(slot),
3325 (nritems - slot) * sizeof(struct btrfs_item));
3326
3327 /* shift the data */
3328 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3329 data_end - total_data, btrfs_leaf_data(leaf) +
3330 data_end, old_data - data_end);
3331 data_end = old_data;
3332 }
3333
3334 /* setup the item for the new data */
3335 for (i = 0; i < nr; i++) {
3336 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3337 btrfs_set_item_key(leaf, &disk_key, slot + i);
3338 item = btrfs_item_nr(leaf, slot + i);
3339 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3340 data_end -= data_size[i];
3341 btrfs_set_item_size(leaf, item, data_size[i]);
3342 }
3343 btrfs_set_header_nritems(leaf, nritems + nr);
3344 btrfs_mark_buffer_dirty(leaf);
3345
3346 ret = 0;
3347 if (slot == 0) {
3348 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3349 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3350 }
3351
3352 if (btrfs_leaf_free_space(root, leaf) < 0) {
3353 btrfs_print_leaf(root, leaf);
3354 BUG();
3355 }
3356out:
3357 return ret;
3358}
3359
3360/*
3361 * Given a key and some data, insert an item into the tree.
3362 * This does all the path init required, making room in the tree if needed.
3363 */
3364int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3365 *root, struct btrfs_key *cpu_key, void *data, u32
3366 data_size)
3367{
3368 int ret = 0;
3369 struct btrfs_path *path;
3370 struct extent_buffer *leaf;
3371 unsigned long ptr;
3372
3373 path = btrfs_alloc_path();
3374 BUG_ON(!path);
3375 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3376 if (!ret) {
3377 leaf = path->nodes[0];
3378 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3379 write_extent_buffer(leaf, data, ptr, data_size);
3380 btrfs_mark_buffer_dirty(leaf);
3381 }
3382 btrfs_free_path(path);
3383 return ret;
3384}
3385
3386/*
3387 * delete the pointer from a given node.
3388 *
3389 * the tree should have been previously balanced so the deletion does not
3390 * empty a node.
3391 */
3392static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3393 struct btrfs_path *path, int level, int slot)
3394{
3395 struct extent_buffer *parent = path->nodes[level];
3396 u32 nritems;
3397 int ret = 0;
3398 int wret;
3399
3400 nritems = btrfs_header_nritems(parent);
3401 if (slot != nritems - 1) {
3402 memmove_extent_buffer(parent,
3403 btrfs_node_key_ptr_offset(slot),
3404 btrfs_node_key_ptr_offset(slot + 1),
3405 sizeof(struct btrfs_key_ptr) *
3406 (nritems - slot - 1));
3407 }
3408 nritems--;
3409 btrfs_set_header_nritems(parent, nritems);
3410 if (nritems == 0 && parent == root->node) {
3411 BUG_ON(btrfs_header_level(root->node) != 1);
3412 /* just turn the root into a leaf and break */
3413 btrfs_set_header_level(root->node, 0);
3414 } else if (slot == 0) {
3415 struct btrfs_disk_key disk_key;
3416
3417 btrfs_node_key(parent, &disk_key, 0);
3418 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3419 if (wret)
3420 ret = wret;
3421 }
3422 btrfs_mark_buffer_dirty(parent);
3423 return ret;
3424}
3425
3426/*
3427 * a helper function to delete the leaf pointed to by path->slots[1] and
3428 * path->nodes[1]. bytenr is the node block pointer, but since the callers
3429 * already know it, it is faster to have them pass it down than to
3430 * read it out of the node again.
3431 *
3432 * This deletes the pointer in path->nodes[1] and frees the leaf
3433 * block extent. zero is returned if it all worked out, < 0 otherwise.
3434 *
3435 * The path must have already been setup for deleting the leaf, including
3436 * all the proper balancing. path->nodes[1] must be locked.
3437 */
3438noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3439 struct btrfs_root *root,
3440 struct btrfs_path *path, u64 bytenr)
3441{
3442 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3444
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret)
3447 return ret;
3448
3449 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0),
3451 path->nodes[1]->start,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1);
3454 return ret;
3455}
3456/*
3457 * delete the item at the leaf level in path. If that empties
3458 * the leaf, remove it from the tree
3459 */
3460int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3461 struct btrfs_path *path, int slot, int nr)
3462{
3463 struct extent_buffer *leaf;
3464 struct btrfs_item *item;
3465 int last_off;
3466 int dsize = 0;
3467 int ret = 0;
3468 int wret;
3469 int i;
3470 u32 nritems;
3471
3472 leaf = path->nodes[0];
3473 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3474
3475 for (i = 0; i < nr; i++)
3476 dsize += btrfs_item_size_nr(leaf, slot + i);
3477
3478 nritems = btrfs_header_nritems(leaf);
3479
3480 if (slot + nr != nritems) {
3481 int data_end = leaf_data_end(root, leaf);
3482
3483 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3484 data_end + dsize,
3485 btrfs_leaf_data(leaf) + data_end,
3486 last_off - data_end);
3487
3488 for (i = slot + nr; i < nritems; i++) {
3489 u32 ioff;
3490
3491 item = btrfs_item_nr(leaf, i);
3492 if (!leaf->map_token) {
3493 map_extent_buffer(leaf, (unsigned long)item,
3494 sizeof(struct btrfs_item),
3495 &leaf->map_token, &leaf->kaddr,
3496 &leaf->map_start, &leaf->map_len,
3497 KM_USER1);
3498 }
3499 ioff = btrfs_item_offset(leaf, item);
3500 btrfs_set_item_offset(leaf, item, ioff + dsize);
3501 }
3502
3503 if (leaf->map_token) {
3504 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3505 leaf->map_token = NULL;
3506 }
3507
3508 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3509 btrfs_item_nr_offset(slot + nr),
3510 sizeof(struct btrfs_item) *
3511 (nritems - slot - nr));
3512 }
3513 btrfs_set_header_nritems(leaf, nritems - nr);
3514 nritems -= nr;
3515
3516 /* delete the leaf if we've emptied it */
3517 if (nritems == 0) {
3518 if (leaf == root->node) {
3519 btrfs_set_header_level(leaf, 0);
3520 } else {
3521 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3522 BUG_ON(ret);
3523 }
3524 } else {
3525 int used = leaf_space_used(leaf, 0, nritems);
3526 if (slot == 0) {
3527 struct btrfs_disk_key disk_key;
3528
3529 btrfs_item_key(leaf, &disk_key, 0);
3530 wret = fixup_low_keys(trans, root, path,
3531 &disk_key, 1);
3532 if (wret)
3533 ret = wret;
3534 }
3535
3536 /* delete the leaf if it is mostly empty */
3537 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3538 /* push_leaf_left fixes the path.
3539 * make sure the path still points to our leaf
3540 * for possible call to del_ptr below
3541 */
3542 slot = path->slots[1];
3543 extent_buffer_get(leaf);
3544
3545 wret = push_leaf_left(trans, root, path, 1, 1);
3546 if (wret < 0 && wret != -ENOSPC)
3547 ret = wret;
3548
3549 if (path->nodes[0] == leaf &&
3550 btrfs_header_nritems(leaf)) {
3551 wret = push_leaf_right(trans, root, path, 1, 1);
3552 if (wret < 0 && wret != -ENOSPC)
3553 ret = wret;
3554 }
3555
3556 if (btrfs_header_nritems(leaf) == 0) {
3557 path->slots[1] = slot;
3558 ret = btrfs_del_leaf(trans, root, path,
3559 leaf->start);
3560 BUG_ON(ret);
3561 free_extent_buffer(leaf);
3562 } else {
3563 /* if we're still in the path, make sure
3564 * we're dirty. Otherwise, one of the
3565 * push_leaf functions must have already
3566 * dirtied this buffer
3567 */
3568 if (path->nodes[0] == leaf)
3569 btrfs_mark_buffer_dirty(leaf);
3570 free_extent_buffer(leaf);
3571 }
3572 } else {
3573 btrfs_mark_buffer_dirty(leaf);
3574 }
3575 }
3576 return ret;
3577}
3578
3579/*
3580 * search the tree again to find a leaf with lesser keys
3581 * returns 0 if it found something or 1 if there are no lesser leaves.
3582 * returns < 0 on io errors.
3583 *
3584 * This may release the path, and so you may lose any locks held at the
3585 * time you call it.
3586 */
3587int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3588{
3589 struct btrfs_key key;
3590 struct btrfs_disk_key found_key;
3591 int ret;
3592
3593 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3594
3595 if (key.offset > 0)
3596 key.offset--;
3597 else if (key.type > 0)
3598 key.type--;
3599 else if (key.objectid > 0)
3600 key.objectid--;
3601 else
3602 return 1;
3603
3604 btrfs_release_path(root, path);
3605 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3606 if (ret < 0)
3607 return ret;
3608 btrfs_item_key(path->nodes[0], &found_key, 0);
3609 ret = comp_keys(&found_key, &key);
3610 if (ret < 0)
3611 return 0;
3612 return 1;
3613}
3614
3615/*
3616 * A helper function to walk down the tree starting at min_key, and looking
3617 * for nodes or leaves that are either in cache or have a minimum
3618 * transaction id. This is used by the btree defrag code, and tree logging
3619 *
3620 * This does not cow, but it does stuff the starting key it finds back
3621 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3622 * key and get a writable path.
3623 *
3624 * This does lock as it descends, and path->keep_locks should be set
3625 * to 1 by the caller.
3626 *
3627 * This honors path->lowest_level to prevent descent past a given level
3628 * of the tree.
3629 *
3630 * min_trans indicates the oldest transaction that you are interested
3631 * in walking through. Any nodes or leaves older than min_trans are
3632 * skipped over (without reading them).
3633 *
3634 * returns zero if something useful was found, < 0 on error and 1 if there
3635 * was nothing in the tree that matched the search criteria.
3636 */
3637int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3638 struct btrfs_key *max_key,
3639 struct btrfs_path *path, int cache_only,
3640 u64 min_trans)
3641{
3642 struct extent_buffer *cur;
3643 struct btrfs_key found_key;
3644 int slot;
3645 int sret;
3646 u32 nritems;
3647 int level;
3648 int ret = 1;
3649
3650 WARN_ON(!path->keep_locks);
3651again:
3652 cur = btrfs_lock_root_node(root);
3653 level = btrfs_header_level(cur);
3654 WARN_ON(path->nodes[level]);
3655 path->nodes[level] = cur;
3656 path->locks[level] = 1;
3657
3658 if (btrfs_header_generation(cur) < min_trans) {
3659 ret = 1;
3660 goto out;
3661 }
3662 while (1) {
3663 nritems = btrfs_header_nritems(cur);
3664 level = btrfs_header_level(cur);
3665 sret = bin_search(cur, min_key, level, &slot);
3666
3667 /* at the lowest level, we're done, setup the path and exit */
3668 if (level == path->lowest_level) {
3669 if (slot >= nritems)
3670 goto find_next_key;
3671 ret = 0;
3672 path->slots[level] = slot;
3673 btrfs_item_key_to_cpu(cur, &found_key, slot);
3674 goto out;
3675 }
3676 if (sret && slot > 0)
3677 slot--;
3678 /*
3679 * check this node pointer against the cache_only and
3680 * min_trans parameters. If it isn't in cache or is too
3681 * old, skip to the next one.
3682 */
3683 while (slot < nritems) {
3684 u64 blockptr;
3685 u64 gen;
3686 struct extent_buffer *tmp;
3687 struct btrfs_disk_key disk_key;
3688
3689 blockptr = btrfs_node_blockptr(cur, slot);
3690 gen = btrfs_node_ptr_generation(cur, slot);
3691 if (gen < min_trans) {
3692 slot++;
3693 continue;
3694 }
3695 if (!cache_only)
3696 break;
3697
3698 if (max_key) {
3699 btrfs_node_key(cur, &disk_key, slot);
3700 if (comp_keys(&disk_key, max_key) >= 0) {
3701 ret = 1;
3702 goto out;
3703 }
3704 }
3705
3706 tmp = btrfs_find_tree_block(root, blockptr,
3707 btrfs_level_size(root, level - 1));
3708
3709 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3710 free_extent_buffer(tmp);
3711 break;
3712 }
3713 if (tmp)
3714 free_extent_buffer(tmp);
3715 slot++;
3716 }
3717find_next_key:
3718 /*
3719 * we didn't find a candidate key in this node, walk forward
3720 * and find another one
3721 */
3722 if (slot >= nritems) {
3723 path->slots[level] = slot;
3724 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans);
3726 if (sret == 0) {
3727 btrfs_release_path(root, path);
3728 goto again;
3729 } else {
3730 goto out;
3731 }
3732 }
3733 /* save our key for returning back */
3734 btrfs_node_key_to_cpu(cur, &found_key, slot);
3735 path->slots[level] = slot;
3736 if (level == path->lowest_level) {
3737 ret = 0;
3738 unlock_up(path, level, 1);
3739 goto out;
3740 }
3741 cur = read_node_slot(root, cur, slot);
3742
3743 btrfs_tree_lock(cur);
3744 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1);
3747 }
3748out:
3749 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key));
3751 return ret;
3752}
3753
3754/*
3755 * this is similar to btrfs_next_leaf, but does not try to preserve
3756 * and fixup the path. It looks for and returns the next key in the
3757 * tree based on the current path and the cache_only and min_trans
3758 * parameters.
3759 *
3760 * 0 is returned if another key is found, < 0 if there are any errors
3761 * and 1 is returned if there are no higher keys in the tree
3762 *
3763 * path->keep_locks should be set to 1 on the search made before
3764 * calling this function.
3765 */
3766int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3767 struct btrfs_key *key, int lowest_level,
3768 int cache_only, u64 min_trans)
3769{
3770 int level = lowest_level;
3771 int slot;
3772 struct extent_buffer *c;
3773
3774 WARN_ON(!path->keep_locks);
3775 while (level < BTRFS_MAX_LEVEL) {
3776 if (!path->nodes[level])
3777 return 1;
3778
3779 slot = path->slots[level] + 1;
3780 c = path->nodes[level];
3781next:
3782 if (slot >= btrfs_header_nritems(c)) {
3783 level++;
3784 if (level == BTRFS_MAX_LEVEL)
3785 return 1;
3786 continue;
3787 }
3788 if (level == 0)
3789 btrfs_item_key_to_cpu(c, key, slot);
3790 else {
3791 u64 blockptr = btrfs_node_blockptr(c, slot);
3792 u64 gen = btrfs_node_ptr_generation(c, slot);
3793
3794 if (cache_only) {
3795 struct extent_buffer *cur;
3796 cur = btrfs_find_tree_block(root, blockptr,
3797 btrfs_level_size(root, level - 1));
3798 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3799 slot++;
3800 if (cur)
3801 free_extent_buffer(cur);
3802 goto next;
3803 }
3804 free_extent_buffer(cur);
3805 }
3806 if (gen < min_trans) {
3807 slot++;
3808 goto next;
3809 }
3810 btrfs_node_key_to_cpu(c, key, slot);
3811 }
3812 return 0;
3813 }
3814 return 1;
3815}
3816
3817/*
3818 * search the tree again to find a leaf with greater keys
3819 * returns 0 if it found something or 1 if there are no greater leaves.
3820 * returns < 0 on io errors.
3821 */
3822int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3823{
3824 int slot;
3825 int level = 1;
3826 struct extent_buffer *c;
3827 struct extent_buffer *next = NULL;
3828 struct btrfs_key key;
3829 u32 nritems;
3830 int ret;
3831
3832 nritems = btrfs_header_nritems(path->nodes[0]);
3833 if (nritems == 0)
3834 return 1;
3835
3836 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3837
3838 btrfs_release_path(root, path);
3839 path->keep_locks = 1;
3840 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3841 path->keep_locks = 0;
3842
3843 if (ret < 0)
3844 return ret;
3845
3846 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /*
3848 * by releasing the path above we dropped all our locks. A balance
3849 * could have added more items next to the key that used to be
3850 * at the very end of the block. So, check again here and
3851 * advance the path if there are now more items available.
3852 */
3853 if (nritems > 0 && path->slots[0] < nritems - 1) {
3854 path->slots[0]++;
3855 goto done;
3856 }
3857
3858 while (level < BTRFS_MAX_LEVEL) {
3859 if (!path->nodes[level])
3860 return 1;
3861
3862 slot = path->slots[level] + 1;
3863 c = path->nodes[level];
3864 if (slot >= btrfs_header_nritems(c)) {
3865 level++;
3866 if (level == BTRFS_MAX_LEVEL)
3867 return 1;
3868 continue;
3869 }
3870
3871 if (next) {
3872 btrfs_tree_unlock(next);
3873 free_extent_buffer(next);
3874 }
3875
3876 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada)
3878 reada_for_search(root, path, level, slot, 0);
3879
3880 next = read_node_slot(root, c, slot);
3881 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c));
3883 btrfs_tree_lock(next);
3884 }
3885 break;
3886 }
3887 path->slots[level] = slot;
3888 while (1) {
3889 level--;
3890 c = path->nodes[level];
3891 if (path->locks[level])
3892 btrfs_tree_unlock(c);
3893 free_extent_buffer(c);
3894 path->nodes[level] = next;
3895 path->slots[level] = 0;
3896 if (!path->skip_locking)
3897 path->locks[level] = 1;
3898 if (!level)
3899 break;
3900 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3905 btrfs_tree_lock(next);
3906 }
3907 }
3908done:
3909 unlock_up(path, 0, 1);
3910 return 0;
3911}
3912
3913/*
3914 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3915 * searching until it gets past min_objectid or finds an item of 'type'
3916 *
3917 * returns 0 if something is found, 1 if nothing was found and < 0 on error
3918 */
3919int btrfs_previous_item(struct btrfs_root *root,
3920 struct btrfs_path *path, u64 min_objectid,
3921 int type)
3922{
3923 struct btrfs_key found_key;
3924 struct extent_buffer *leaf;
3925 u32 nritems;
3926 int ret;
3927
3928 while (1) {
3929 if (path->slots[0] == 0) {
3930 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0)
3932 return ret;
3933 } else {
3934 path->slots[0]--;
3935 }
3936 leaf = path->nodes[0];
3937 nritems = btrfs_header_nritems(leaf);
3938 if (nritems == 0)
3939 return 1;
3940 if (path->slots[0] == nritems)
3941 path->slots[0]--;
3942
3943 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3944 if (found_key.type == type)
3945 return 0;
3946 if (found_key.objectid < min_objectid)
3947 break;
3948 if (found_key.objectid == min_objectid &&
3949 found_key.type < type)
3950 break;
3951 }
3952 return 1;
3953}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..eee060f88113
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2129 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__
21
22#include <linux/version.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/fs.h>
26#include <linux/completion.h>
27#include <linux/backing-dev.h>
28#include <linux/wait.h>
29#include <asm/kmap_types.h>
30#include "extent_io.h"
31#include "extent_map.h"
32#include "async-thread.h"
33
34struct btrfs_trans_handle;
35struct btrfs_transaction;
36extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep;
39extern struct kmem_cache *btrfs_path_cachep;
40struct btrfs_ordered_sum;
41
42#define BTRFS_MAGIC "_BHRfS_M"
43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45
46#ifdef CONFIG_LOCKDEP
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51
52/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL
54
55/* stores information about which extents are in use, and reference counts */
56#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
57
58/*
59 * chunk tree stores translations from logical -> physical block numbering
60 * the super block points to the chunk tree
61 */
62#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
63
64/*
65 * stores information about which areas of a given device are in use.
66 * one per device. The tree of tree roots points to the device tree
67 */
68#define BTRFS_DEV_TREE_OBJECTID 4ULL
69
70/* one per subvolume, storing files and directories */
71#define BTRFS_FS_TREE_OBJECTID 5ULL
72
73/* directory objectid inside the root tree */
74#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
75
76/* holds checksums of all the data extents */
77#define BTRFS_CSUM_TREE_OBJECTID 7ULL
78
79/* orphan objectid for tracking unlinked/truncated files */
80#define BTRFS_ORPHAN_OBJECTID -5ULL
81
82/* does write ahead logging to speed up fsyncs */
83#define BTRFS_TREE_LOG_OBJECTID -6ULL
84#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
85
86/* for space balancing */
87#define BTRFS_TREE_RELOC_OBJECTID -8ULL
88#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
89
90/*
91 * extent checksums all have this objectid
92 * this allows them to share the logging tree
93 * for fsyncs
94 */
95#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
96
97/* dummy objectid represents multiple objectids */
98#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
99
100/*
101 * All files have objectids in this range.
102 */
103#define BTRFS_FIRST_FREE_OBJECTID 256ULL
104#define BTRFS_LAST_FREE_OBJECTID -256ULL
105#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
106
107
108/*
109 * the device items go into the chunk tree. The key is in the form
110 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
111 */
112#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
113
114/*
115 * we can actually store much bigger names, but lets not confuse the rest
116 * of linux
117 */
118#define BTRFS_NAME_LEN 255
119
120/* 32 bytes in various csum fields */
121#define BTRFS_CSUM_SIZE 32
122
123/* csum types */
124#define BTRFS_CSUM_TYPE_CRC32 0
125
126static int btrfs_csum_sizes[] = { 4, 0 };
127
128/* four bytes for CRC32 */
129#define BTRFS_EMPTY_DIR_SIZE 0
130
131#define BTRFS_FT_UNKNOWN 0
132#define BTRFS_FT_REG_FILE 1
133#define BTRFS_FT_DIR 2
134#define BTRFS_FT_CHRDEV 3
135#define BTRFS_FT_BLKDEV 4
136#define BTRFS_FT_FIFO 5
137#define BTRFS_FT_SOCK 6
138#define BTRFS_FT_SYMLINK 7
139#define BTRFS_FT_XATTR 8
140#define BTRFS_FT_MAX 9
141
142/*
143 * the key defines the order in the tree, and so it also defines (optimal)
144 * block layout. objectid corresponds to the inode number. The flags
145 * tells us things about the object, and is a kind of stream selector.
146 * so for a given inode, keys with flags of 1 might refer to the inode
147 * data, flags of 2 may point to file data in the btree and flags == 3
148 * may point to extents.
149 *
150 * offset is the starting byte offset for this key in the stream.
151 *
152 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
153 * in cpu native order. Otherwise they are identical and their sizes
154 * should be the same (ie both packed)
155 */
156struct btrfs_disk_key {
157 __le64 objectid;
158 u8 type;
159 __le64 offset;
160} __attribute__ ((__packed__));
161
162struct btrfs_key {
163 u64 objectid;
164 u8 type;
165 u64 offset;
166} __attribute__ ((__packed__));
167
168struct btrfs_mapping_tree {
169 struct extent_map_tree map_tree;
170};
171
172#define BTRFS_UUID_SIZE 16
173struct btrfs_dev_item {
174 /* the internal btrfs device id */
175 __le64 devid;
176
177 /* size of the device */
178 __le64 total_bytes;
179
180 /* bytes used */
181 __le64 bytes_used;
182
183 /* optimal io alignment for this device */
184 __le32 io_align;
185
186 /* optimal io width for this device */
187 __le32 io_width;
188
189 /* minimal io size for this device */
190 __le32 sector_size;
191
192 /* type and info about this device */
193 __le64 type;
194
195 /* expected generation for this device */
196 __le64 generation;
197
198 /*
199 * starting byte of this partition on the device,
200 * to allow for stripe alignment in the future
201 */
202 __le64 start_offset;
203
204 /* grouping information for allocation decisions */
205 __le32 dev_group;
206
207 /* seek speed 0-100 where 100 is fastest */
208 u8 seek_speed;
209
210 /* bandwidth 0-100 where 100 is fastest */
211 u8 bandwidth;
212
213 /* btrfs generated uuid for this device */
214 u8 uuid[BTRFS_UUID_SIZE];
215
216 /* uuid of FS who owns this device */
217 u8 fsid[BTRFS_UUID_SIZE];
218} __attribute__ ((__packed__));
219
220struct btrfs_stripe {
221 __le64 devid;
222 __le64 offset;
223 u8 dev_uuid[BTRFS_UUID_SIZE];
224} __attribute__ ((__packed__));
225
226struct btrfs_chunk {
227 /* size of this chunk in bytes */
228 __le64 length;
229
230 /* objectid of the root referencing this chunk */
231 __le64 owner;
232
233 __le64 stripe_len;
234 __le64 type;
235
236 /* optimal io alignment for this chunk */
237 __le32 io_align;
238
239 /* optimal io width for this chunk */
240 __le32 io_width;
241
242 /* minimal io size for this chunk */
243 __le32 sector_size;
244
245 /* 2^16 stripes is quite a lot, a second limit is the size of a single
246 * item in the btree
247 */
248 __le16 num_stripes;
249
250 /* sub stripes only matter for raid10 */
251 __le16 sub_stripes;
252 struct btrfs_stripe stripe;
253 /* additional stripes go here */
254} __attribute__ ((__packed__));
255
256static inline unsigned long btrfs_chunk_item_size(int num_stripes)
257{
258 BUG_ON(num_stripes == 0);
259 return sizeof(struct btrfs_chunk) +
260 sizeof(struct btrfs_stripe) * (num_stripes - 1);
261}
262
263#define BTRFS_FSID_SIZE 16
264#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
265
266/*
267 * every tree block (leaf or node) starts with this header.
268 */
269struct btrfs_header {
270 /* these first four must match the super block */
271 u8 csum[BTRFS_CSUM_SIZE];
272 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
273 __le64 bytenr; /* which block this node is supposed to live in */
274 __le64 flags;
275
276 /* allowed to be different from the super from here on down */
277 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
278 __le64 generation;
279 __le64 owner;
280 __le32 nritems;
281 u8 level;
282} __attribute__ ((__packed__));
283
284#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
285 sizeof(struct btrfs_header)) / \
286 sizeof(struct btrfs_key_ptr))
/* block size 'bs' minus the size of struct btrfs_header */
#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
/*
 * same computation using the leafsize field of 'r'.  The argument is
 * parenthesized so that any pointer expression (p + 1, &obj, ...) can be
 * passed without being torn apart by the -> operator.
 */
#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE((r)->leafsize))
289#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
290 sizeof(struct btrfs_item) - \
291 sizeof(struct btrfs_file_extent_item))
292
293#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
294
295/*
296 * this is a very generous portion of the super block, giving us
297 * room to translate 14 chunks with 3 stripes each.
298 */
299#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
300#define BTRFS_LABEL_SIZE 256
301
302/*
303 * the super block basically lists the main trees of the FS
304 * it currently lacks any block count etc etc
305 */
306struct btrfs_super_block {
307 u8 csum[BTRFS_CSUM_SIZE];
308 /* the first 4 fields must match struct btrfs_header */
309 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
310 __le64 bytenr; /* this block number */
311 __le64 flags;
312
313 /* allowed to be different from the btrfs_header from here on down */
314 __le64 magic;
315 __le64 generation;
316 __le64 root;
317 __le64 chunk_root;
318 __le64 log_root;
319
320 /* this will help find the new super based on the log root */
321 __le64 log_root_transid;
322 __le64 total_bytes;
323 __le64 bytes_used;
324 __le64 root_dir_objectid;
325 __le64 num_devices;
326 __le32 sectorsize;
327 __le32 nodesize;
328 __le32 leafsize;
329 __le32 stripesize;
330 __le32 sys_chunk_array_size;
331 __le64 chunk_root_generation;
332 __le64 compat_flags;
333 __le64 compat_ro_flags;
334 __le64 incompat_flags;
335 __le16 csum_type;
336 u8 root_level;
337 u8 chunk_root_level;
338 u8 log_root_level;
339 struct btrfs_dev_item dev_item;
340
341 char label[BTRFS_LABEL_SIZE];
342
343 /* future expansion */
344 __le64 reserved[32];
345 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
346} __attribute__ ((__packed__));
347
348/*
349 * Compat flags that we support. If any incompat flags are set other than the
350 * ones specified below then we will fail to mount
351 */
352#define BTRFS_FEATURE_COMPAT_SUPP 0x0
353#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0
354#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0
355
356/*
357 * A leaf is full of items. offset and size tell us where to find
358 * the item in the leaf (relative to the start of the data area)
359 */
360struct btrfs_item {
361 struct btrfs_disk_key key;
362 __le32 offset;
363 __le32 size;
364} __attribute__ ((__packed__));
365
366/*
367 * leaves have an item area and a data area:
368 * [item0, item1....itemN] [free space] [dataN...data1, data0]
369 *
370 * The data is separate from the items to get the keys closer together
371 * during searches.
372 */
373struct btrfs_leaf {
374 struct btrfs_header header;
375 struct btrfs_item items[];
376} __attribute__ ((__packed__));
377
378/*
379 * all non-leaf blocks are nodes, they hold only keys and pointers to
380 * other blocks
381 */
382struct btrfs_key_ptr {
383 struct btrfs_disk_key key;
384 __le64 blockptr;
385 __le64 generation;
386} __attribute__ ((__packed__));
387
388struct btrfs_node {
389 struct btrfs_header header;
390 struct btrfs_key_ptr ptrs[];
391} __attribute__ ((__packed__));
392
393/*
394 * btrfs_paths remember the path taken from the root down to the leaf.
395 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
396 * to any other levels that are present.
397 *
398 * The slots array records the index of the item or block pointer
399 * used while walking the tree.
400 */
401struct btrfs_path {
402 struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
403 int slots[BTRFS_MAX_LEVEL];
404 /* if there is real range locking, this locks field will change */
405 int locks[BTRFS_MAX_LEVEL];
406 int reada;
407 /* keep some upper locks as we walk down */
408 int keep_locks;
409 int skip_locking;
410 int lowest_level;
411
412 /*
413 * set by btrfs_split_item, tells search_slot to keep all locks
414 * and to force calls to keep space in the nodes
415 */
416 int search_for_split;
417};
418
419/*
420 * items in the extent btree are used to record the objectid of the
421 * owner of the block and the number of references
422 */
423struct btrfs_extent_item {
424 __le32 refs;
425} __attribute__ ((__packed__));
426
427struct btrfs_extent_ref {
428 __le64 root;
429 __le64 generation;
430 __le64 objectid;
431 __le32 num_refs;
432} __attribute__ ((__packed__));
433
434/* dev extents record free space on individual devices. The owner
435 * field points back to the chunk allocation mapping tree that allocated
436 * the extent. The chunk tree uuid field is a way to double check the owner
437 */
438struct btrfs_dev_extent {
439 __le64 chunk_tree;
440 __le64 chunk_objectid;
441 __le64 chunk_offset;
442 __le64 length;
443 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
444} __attribute__ ((__packed__));
445
446struct btrfs_inode_ref {
447 __le64 index;
448 __le16 name_len;
449 /* name goes here */
450} __attribute__ ((__packed__));
451
452struct btrfs_timespec {
453 __le64 sec;
454 __le32 nsec;
455} __attribute__ ((__packed__));
456
457typedef enum {
458 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type;
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468
469struct btrfs_inode_item {
470 /* nfs style generation number */
471 __le64 generation;
472 /* transid that last touched this inode */
473 __le64 transid;
474 __le64 size;
475 __le64 nbytes;
476 __le64 block_group;
477 __le32 nlink;
478 __le32 uid;
479 __le32 gid;
480 __le32 mode;
481 __le64 rdev;
482 __le64 flags;
483
484 /* modification sequence number for NFS */
485 __le64 sequence;
486
487 /*
488 * a little future expansion, for more than this we can
489 * just grow the inode item and version it
490 */
491 __le64 reserved[4];
492 struct btrfs_timespec atime;
493 struct btrfs_timespec ctime;
494 struct btrfs_timespec mtime;
495 struct btrfs_timespec otime;
496} __attribute__ ((__packed__));
497
498struct btrfs_dir_log_item {
499 __le64 end;
500} __attribute__ ((__packed__));
501
502struct btrfs_dir_item {
503 struct btrfs_disk_key location;
504 __le64 transid;
505 __le16 data_len;
506 __le16 name_len;
507 u8 type;
508} __attribute__ ((__packed__));
509
510struct btrfs_root_item {
511 struct btrfs_inode_item inode;
512 __le64 generation;
513 __le64 root_dirid;
514 __le64 bytenr;
515 __le64 byte_limit;
516 __le64 bytes_used;
517 __le64 last_snapshot;
518 __le64 flags;
519 __le32 refs;
520 struct btrfs_disk_key drop_progress;
521 u8 drop_level;
522 u8 level;
523} __attribute__ ((__packed__));
524
525/*
526 * this is used for both forward and backward root refs
527 */
528struct btrfs_root_ref {
529 __le64 dirid;
530 __le64 sequence;
531 __le16 name_len;
532} __attribute__ ((__packed__));
533
534#define BTRFS_FILE_EXTENT_INLINE 0
535#define BTRFS_FILE_EXTENT_REG 1
536#define BTRFS_FILE_EXTENT_PREALLOC 2
537
538struct btrfs_file_extent_item {
539 /*
540 * transaction id that created this extent
541 */
542 __le64 generation;
543 /*
544 * max number of bytes to hold this extent in ram
545 * when we split a compressed extent we can't know how big
546 * each of the resulting pieces will be. So, this is
547 * an upper limit on the size of the extent in ram instead of
548 * an exact limit.
549 */
550 __le64 ram_bytes;
551
552 /*
553 * 32 bits for the various ways we might encode the data,
554 * including compression and encryption. If any of these
555 * are set to something a given disk format doesn't understand
556 * it is treated like an incompat flag for reading and writing,
557 * but not for stat.
558 */
559 u8 compression;
560 u8 encryption;
561 __le16 other_encoding; /* spare for later use */
562
563 /* are we inline data or a real extent? */
564 u8 type;
565
566 /*
567 * disk space consumed by the extent, checksum blocks are included
568 * in these numbers
569 */
570 __le64 disk_bytenr;
571 __le64 disk_num_bytes;
572 /*
573 * the logical offset in file blocks (no csums)
574 * this extent record is for. This allows a file extent to point
575 * into the middle of an existing extent on disk, sharing it
576 * between two snapshots (useful if some bytes in the middle of the
577 * extent have changed)
578 */
579 __le64 offset;
580 /*
581 * the logical number of file blocks (no csums included). This
582 * always reflects the size uncompressed and without encoding.
583 */
584 __le64 num_bytes;
585
586} __attribute__ ((__packed__));
587
588struct btrfs_csum_item {
589 u8 csum;
590} __attribute__ ((__packed__));
591
592/* different types of block groups (and chunks) */
593#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
594#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
595#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
596#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
597#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
598#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
599#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
600
601struct btrfs_block_group_item {
602 __le64 used;
603 __le64 chunk_objectid;
604 __le64 flags;
605} __attribute__ ((__packed__));
606
607struct btrfs_space_info {
608 u64 flags;
609 u64 total_bytes;
610 u64 bytes_used;
611 u64 bytes_pinned;
612 u64 bytes_reserved;
613 u64 bytes_readonly;
614 int full;
615 int force_alloc;
616 struct list_head list;
617
618 /* for block groups in our same type */
619 struct list_head block_groups;
620 spinlock_t lock;
621 struct rw_semaphore groups_sem;
622};
623
624struct btrfs_free_space {
625 struct rb_node bytes_index;
626 struct rb_node offset_index;
627 u64 offset;
628 u64 bytes;
629};
630
631struct btrfs_block_group_cache {
632 struct btrfs_key key;
633 struct btrfs_block_group_item item;
634 spinlock_t lock;
635 struct mutex alloc_mutex;
636 struct mutex cache_mutex;
637 u64 pinned;
638 u64 reserved;
639 u64 flags;
640 int cached;
641 int ro;
642 int dirty;
643
644 struct btrfs_space_info *space_info;
645
646 /* free space cache stuff */
647 struct rb_root free_space_bytes;
648 struct rb_root free_space_offset;
649
650 /* block group cache stuff */
651 struct rb_node cache_node;
652
653 /* for block groups in the same raid type */
654 struct list_head list;
655
656 /* usage count */
657 atomic_t count;
658};
659
660struct btrfs_leaf_ref_tree {
661 struct rb_root root;
662 struct list_head list;
663 spinlock_t lock;
664};
665
666struct btrfs_device;
667struct btrfs_fs_devices;
668struct btrfs_fs_info {
669 u8 fsid[BTRFS_FSID_SIZE];
670 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
671 struct btrfs_root *extent_root;
672 struct btrfs_root *tree_root;
673 struct btrfs_root *chunk_root;
674 struct btrfs_root *dev_root;
675 struct btrfs_root *fs_root;
676 struct btrfs_root *csum_root;
677
678 /* the log root tree is a directory of all the other log roots */
679 struct btrfs_root *log_root_tree;
680 struct radix_tree_root fs_roots_radix;
681
682 /* block group cache stuff */
683 spinlock_t block_group_cache_lock;
684 struct rb_root block_group_cache_tree;
685
686 struct extent_io_tree pinned_extents;
687 struct extent_io_tree pending_del;
688 struct extent_io_tree extent_ins;
689
690 /* logical->physical extent mapping */
691 struct btrfs_mapping_tree mapping_tree;
692
693 u64 generation;
694 u64 last_trans_committed;
695 u64 last_trans_new_blockgroup;
696 u64 open_ioctl_trans;
697 unsigned long mount_opt;
698 u64 max_extent;
699 u64 max_inline;
700 u64 alloc_start;
701 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707
708 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit;
710 struct block_device *__bdev;
711 struct super_block *sb;
712 struct inode *btree_inode;
713 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex;
716 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex;
718 struct mutex cleaner_mutex;
719 struct mutex extent_ins_mutex;
720 struct mutex pinned_mutex;
721 struct mutex chunk_mutex;
722 struct mutex drop_mutex;
723 struct mutex volume_mutex;
724 struct mutex tree_reloc_mutex;
725 struct list_head trans_list;
726 struct list_head hashers;
727 struct list_head dead_roots;
728
729 atomic_t nr_async_submits;
730 atomic_t async_submit_draining;
731 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737
738 /*
739 * this is used by the balancing code to wait for all the pending
740 * ordered extents
741 */
742 spinlock_t ordered_extent_lock;
743 struct list_head ordered_extents;
744 struct list_head delalloc_inodes;
745
746 /*
747 * there is a pool of worker threads for checksumming during writes
748 * and a pool for checksumming after reads. This is because readers
749 * can run with FS locks held, and the writers may be waiting for
750 * those locks. We don't want ordering in the pending list to cause
751 * deadlocks, and so the two are serviced separately.
752 *
753 * A third pool does submit_bio to avoid deadlocking with the other
754 * two
755 */
756 struct btrfs_workers workers;
757 struct btrfs_workers delalloc_workers;
758 struct btrfs_workers endio_workers;
759 struct btrfs_workers endio_meta_workers;
760 struct btrfs_workers endio_meta_write_workers;
761 struct btrfs_workers endio_write_workers;
762 struct btrfs_workers submit_workers;
763 /*
764 * fixup workers take dirty pages that didn't properly go through
765 * the cow mechanism and make them safe to write. It happens
766 * for the sys_munmap function call path
767 */
768 struct btrfs_workers fixup_workers;
769 struct task_struct *transaction_kthread;
770 struct task_struct *cleaner_kthread;
771 int thread_pool_size;
772
773 /* tree relocation related fields */
774 struct list_head dead_reloc_roots;
775 struct btrfs_leaf_ref_tree reloc_ref_tree;
776 struct btrfs_leaf_ref_tree shared_ref_tree;
777
778 struct kobject super_kobj;
779 struct completion kobj_unregister;
780 int do_barriers;
781 int closing;
782 int log_root_recovering;
783 atomic_t throttles;
784 atomic_t throttle_gen;
785
786 u64 total_pinned;
787 struct list_head dirty_cowonly_roots;
788
789 struct btrfs_fs_devices *fs_devices;
790 struct list_head space_info;
791 spinlock_t delalloc_lock;
792 spinlock_t new_trans_lock;
793 u64 delalloc_bytes;
794 u64 last_alloc;
795 u64 last_data_alloc;
796
797 spinlock_t ref_cache_lock;
798 u64 total_ref_cache_size;
799
800 u64 avail_data_alloc_bits;
801 u64 avail_metadata_alloc_bits;
802 u64 avail_system_alloc_bits;
803 u64 data_alloc_profile;
804 u64 metadata_alloc_profile;
805 u64 system_alloc_profile;
806
807 void *bdev_holder;
808};
809
810/*
811 * in ram representation of the tree. extent_root is used for all allocations
812 * and for the extent tree extent_root root.
813 */
814struct btrfs_dirty_root;
815struct btrfs_root {
816 struct extent_buffer *node;
817
818 /* the node lock is held while changing the node pointer */
819 spinlock_t node_lock;
820
821 struct extent_buffer *commit_root;
822 struct btrfs_leaf_ref_tree *ref_tree;
823 struct btrfs_leaf_ref_tree ref_tree_struct;
824 struct btrfs_dirty_root *dirty_root;
825 struct btrfs_root *log_root;
826 struct btrfs_root *reloc_root;
827
828 struct btrfs_root_item root_item;
829 struct btrfs_key root_key;
830 struct btrfs_fs_info *fs_info;
831 struct extent_io_tree dirty_log_pages;
832
833 struct kobject root_kobj;
834 struct completion kobj_unregister;
835 struct mutex objectid_mutex;
836 struct mutex log_mutex;
837
838 u64 objectid;
839 u64 last_trans;
840
841 /* data allocations are done in sectorsize units */
842 u32 sectorsize;
843
844 /* node allocations are done in nodesize units */
845 u32 nodesize;
846
847 /* leaf allocations are done in leafsize units */
848 u32 leafsize;
849
850 u32 stripesize;
851
852 u32 type;
853 u64 highest_inode;
854 u64 last_inode_alloc;
855 int ref_cows;
856 int track_dirty;
857 u64 defrag_trans_start;
858 struct btrfs_key defrag_progress;
859 struct btrfs_key defrag_max;
860 int defrag_running;
861 int defrag_level;
862 char *name;
863 int in_sysfs;
864
865 /* the dirty list is only used by non-reference counted roots */
866 struct list_head dirty_list;
867
868 spinlock_t list_lock;
869 struct list_head dead_list;
870 struct list_head orphan_list;
871
872 /*
873 * right now this just gets used so that a root has its own devid
874 * for stat. It may be used for more later
875 */
876 struct super_block anon_super;
877};
878
879/*
880
881 * inode items have the data typically returned from stat and store other
882 * info about object characteristics. There is one for every file and dir in
883 * the FS
884 */
885#define BTRFS_INODE_ITEM_KEY 1
886#define BTRFS_INODE_REF_KEY 12
887#define BTRFS_XATTR_ITEM_KEY 24
888#define BTRFS_ORPHAN_ITEM_KEY 48
889/* reserve 2-15 close to the inode for later flexibility */
890
891/*
892 * dir items are the name -> inode pointers in a directory. There is one
893 * for every name in a directory.
894 */
895#define BTRFS_DIR_LOG_ITEM_KEY 60
896#define BTRFS_DIR_LOG_INDEX_KEY 72
897#define BTRFS_DIR_ITEM_KEY 84
898#define BTRFS_DIR_INDEX_KEY 96
899/*
900 * extent data is for file data
901 */
902#define BTRFS_EXTENT_DATA_KEY 108
903
904/*
905 * extent csums are stored in a separate tree and hold csums for
906 * an entire extent on disk.
907 */
908#define BTRFS_EXTENT_CSUM_KEY 128
909
910/*
911 * root items point to tree roots. They are typically in the root
912 * tree used by the super block to find all the other trees
913 */
914#define BTRFS_ROOT_ITEM_KEY 132
915
916/*
917 * root backrefs tie subvols and snapshots to the directory entries that
918 * reference them
919 */
920#define BTRFS_ROOT_BACKREF_KEY 144
921
922/*
923 * root refs make a fast index for listing all of the snapshots and
924 * subvolumes referenced by a given root. They point directly to the
925 * directory item in the root that references the subvol
926 */
927#define BTRFS_ROOT_REF_KEY 156
928
929/*
930 * extent items are in the extent map tree. These record which blocks
931 * are used, and how many references there are to each block
932 */
933#define BTRFS_EXTENT_ITEM_KEY 168
934#define BTRFS_EXTENT_REF_KEY 180
935
936/*
937 * block groups give us hints into the extent allocation trees. Which
938 * blocks are free etc etc
939 */
940#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
941
942#define BTRFS_DEV_EXTENT_KEY 204
943#define BTRFS_DEV_ITEM_KEY 216
944#define BTRFS_CHUNK_ITEM_KEY 228
945
946/*
947 * string items are for debugging. They just store a short string of
948 * data in the FS
949 */
950#define BTRFS_STRING_ITEM_KEY 253
951
952#define BTRFS_MOUNT_NODATASUM (1 << 0)
953#define BTRFS_MOUNT_NODATACOW (1 << 1)
954#define BTRFS_MOUNT_NOBARRIER (1 << 2)
955#define BTRFS_MOUNT_SSD (1 << 3)
956#define BTRFS_MOUNT_DEGRADED (1 << 4)
957#define BTRFS_MOUNT_COMPRESS (1 << 5)
958
959#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
960#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
961#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
962 BTRFS_MOUNT_##opt)
963/*
964 * Inode flags
965 */
966#define BTRFS_INODE_NODATASUM (1 << 0)
967#define BTRFS_INODE_NODATACOW (1 << 1)
968#define BTRFS_INODE_READONLY (1 << 2)
969#define BTRFS_INODE_NOCOMPRESS (1 << 3)
970#define BTRFS_INODE_PREALLOC (1 << 4)
971#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
972 ~BTRFS_INODE_##flag)
973#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
974 BTRFS_INODE_##flag)
975#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
976 BTRFS_INODE_##flag)
977/* some macros to generate set/get funcs for the struct fields. This
978 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
979 * one for u8:
980 */
981#define le8_to_cpu(v) (v)
982#define cpu_to_le8(v) (v)
983#define __le8 u8
984
985#define read_eb_member(eb, ptr, type, member, result) ( \
986 read_extent_buffer(eb, (char *)(result), \
987 ((unsigned long)(ptr)) + \
988 offsetof(type, member), \
989 sizeof(((type *)0)->member)))
990
991#define write_eb_member(eb, ptr, type, member, result) ( \
992 write_extent_buffer(eb, (char *)(result), \
993 ((unsigned long)(ptr)) + \
994 offsetof(type, member), \
995 sizeof(((type *)0)->member)))
996
997#ifndef BTRFS_SETGET_FUNCS
998#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
999u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
1000void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
1001#endif
1002
1003#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1004static inline u##bits btrfs_##name(struct extent_buffer *eb) \
1005{ \
1006 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1007 u##bits res = le##bits##_to_cpu(p->member); \
1008 kunmap_atomic(p, KM_USER0); \
1009 return res; \
1010} \
1011static inline void btrfs_set_##name(struct extent_buffer *eb, \
1012 u##bits val) \
1013{ \
1014 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1015 p->member = cpu_to_le##bits(val); \
1016 kunmap_atomic(p, KM_USER0); \
1017}
1018
1019#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
1020static inline u##bits btrfs_##name(type *s) \
1021{ \
1022 return le##bits##_to_cpu(s->member); \
1023} \
1024static inline void btrfs_set_##name(type *s, u##bits val) \
1025{ \
1026 s->member = cpu_to_le##bits(val); \
1027}
1028
1029BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
1030BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
1031BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
1032BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
1033BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
1034BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
1035 start_offset, 64);
1036BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
1037BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
1038BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
1039BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
1040BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
1041BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
1042
1043BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
1044BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
1045 total_bytes, 64);
1046BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
1047 bytes_used, 64);
1048BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
1049 io_align, 32);
1050BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
1051 io_width, 32);
1052BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
1053 sector_size, 32);
1054BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
1055BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
1056 dev_group, 32);
1057BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
1058 seek_speed, 8);
1059BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
1060 bandwidth, 8);
1061BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
1062 generation, 64);
1063
1064static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
1065{
1066 return (char *)d + offsetof(struct btrfs_dev_item, uuid);
1067}
1068
1069static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
1070{
1071 return (char *)d + offsetof(struct btrfs_dev_item, fsid);
1072}
1073
1074BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
1075BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
1076BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
1077BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
1078BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
1079BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
1080BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
1081BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
1082BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
1083BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
1084BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
1085
1086static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
1087{
1088 return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
1089}
1090
1091BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
1092BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
1093BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
1094 stripe_len, 64);
1095BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
1096 io_align, 32);
1097BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
1098 io_width, 32);
1099BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
1100 sector_size, 32);
1101BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
1102BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
1103 num_stripes, 16);
1104BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
1105 sub_stripes, 16);
1106BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
1107BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
1108
1109static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
1110 int nr)
1111{
1112 unsigned long offset = (unsigned long)c;
1113 offset += offsetof(struct btrfs_chunk, stripe);
1114 offset += nr * sizeof(struct btrfs_stripe);
1115 return (struct btrfs_stripe *)offset;
1116}
1117
/* Return the address of the dev_uuid member of stripe @nr in chunk @c. */
static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
{
	struct btrfs_stripe *stripe = btrfs_stripe_nr(c, nr);

	return btrfs_stripe_dev_uuid(stripe);
}
1122
1123static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
1124 struct btrfs_chunk *c, int nr)
1125{
1126 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
1127}
1128
1129static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
1130 struct btrfs_chunk *c, int nr,
1131 u64 val)
1132{
1133 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
1134}
1135
1136static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
1137 struct btrfs_chunk *c, int nr)
1138{
1139 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
1140}
1141
1142static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
1143 struct btrfs_chunk *c, int nr,
1144 u64 val)
1145{
1146 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
1147}
1148
1149/* struct btrfs_block_group_item */
1150BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
1151 used, 64);
1152BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
1153 used, 64);
1154BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
1155 struct btrfs_block_group_item, chunk_objectid, 64);
1156
1157BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
1158 struct btrfs_block_group_item, chunk_objectid, 64);
1159BTRFS_SETGET_FUNCS(disk_block_group_flags,
1160 struct btrfs_block_group_item, flags, 64);
1161BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1162 struct btrfs_block_group_item, flags, 64);
1163
1164/* struct btrfs_inode_ref */
1165BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1166BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1167
1168/* struct btrfs_inode_item */
1169BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1170BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
1171BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
1172BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
1173BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
1174BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
1175BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
1176BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
1177BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
1178BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
1179BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
1180BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
1181
1182static inline struct btrfs_timespec *
1183btrfs_inode_atime(struct btrfs_inode_item *inode_item)
1184{
1185 unsigned long ptr = (unsigned long)inode_item;
1186 ptr += offsetof(struct btrfs_inode_item, atime);
1187 return (struct btrfs_timespec *)ptr;
1188}
1189
1190static inline struct btrfs_timespec *
1191btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
1192{
1193 unsigned long ptr = (unsigned long)inode_item;
1194 ptr += offsetof(struct btrfs_inode_item, mtime);
1195 return (struct btrfs_timespec *)ptr;
1196}
1197
1198static inline struct btrfs_timespec *
1199btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1200{
1201 unsigned long ptr = (unsigned long)inode_item;
1202 ptr += offsetof(struct btrfs_inode_item, ctime);
1203 return (struct btrfs_timespec *)ptr;
1204}
1205
1206static inline struct btrfs_timespec *
1207btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1208{
1209 unsigned long ptr = (unsigned long)inode_item;
1210 ptr += offsetof(struct btrfs_inode_item, otime);
1211 return (struct btrfs_timespec *)ptr;
1212}
1213
1214BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1215BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1216
1217/* struct btrfs_dev_extent */
1218BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
1219 chunk_tree, 64);
1220BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
1221 chunk_objectid, 64);
1222BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
1223 chunk_offset, 64);
1224BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
1225
1226static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1227{
1228 unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
1229 return (u8 *)((unsigned long)dev + ptr);
1230}
1231
1232/* struct btrfs_extent_ref */
1233BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
1234BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
1235BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
1236BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1237
1238BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1239BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1240 generation, 64);
1241BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1242 objectid, 64);
1243BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1244 num_refs, 32);
1245
1246/* struct btrfs_extent_item */
1247BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
1248BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
1249 refs, 32);
1250
1251/* struct btrfs_node */
1252BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
1253BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
1254
1255static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
1256{
1257 unsigned long ptr;
1258 ptr = offsetof(struct btrfs_node, ptrs) +
1259 sizeof(struct btrfs_key_ptr) * nr;
1260 return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
1261}
1262
1263static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
1264 int nr, u64 val)
1265{
1266 unsigned long ptr;
1267 ptr = offsetof(struct btrfs_node, ptrs) +
1268 sizeof(struct btrfs_key_ptr) * nr;
1269 btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
1270}
1271
1272static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
1273{
1274 unsigned long ptr;
1275 ptr = offsetof(struct btrfs_node, ptrs) +
1276 sizeof(struct btrfs_key_ptr) * nr;
1277 return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
1278}
1279
1280static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
1281 int nr, u64 val)
1282{
1283 unsigned long ptr;
1284 ptr = offsetof(struct btrfs_node, ptrs) +
1285 sizeof(struct btrfs_key_ptr) * nr;
1286 btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
1287}
1288
1289static inline unsigned long btrfs_node_key_ptr_offset(int nr)
1290{
1291 return offsetof(struct btrfs_node, ptrs) +
1292 sizeof(struct btrfs_key_ptr) * nr;
1293}
1294
1295void btrfs_node_key(struct extent_buffer *eb,
1296 struct btrfs_disk_key *disk_key, int nr);
1297
1298static inline void btrfs_set_node_key(struct extent_buffer *eb,
1299 struct btrfs_disk_key *disk_key, int nr)
1300{
1301 unsigned long ptr;
1302 ptr = btrfs_node_key_ptr_offset(nr);
1303 write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
1304 struct btrfs_key_ptr, key, disk_key);
1305}
1306
1307/* struct btrfs_item */
1308BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
1309BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
1310
1311static inline unsigned long btrfs_item_nr_offset(int nr)
1312{
1313 return offsetof(struct btrfs_leaf, items) +
1314 sizeof(struct btrfs_item) * nr;
1315}
1316
/*
 * Return the offset of item header @nr (see btrfs_item_nr_offset()) cast
 * to a btrfs_item pointer.  @eb is not used in the computation.
 */
static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
					       int nr)
{
	unsigned long off = btrfs_item_nr_offset(nr);

	return (struct btrfs_item *)off;
}
1322
1323static inline u32 btrfs_item_end(struct extent_buffer *eb,
1324 struct btrfs_item *item)
1325{
1326 return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
1327}
1328
1329static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
1330{
1331 return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
1332}
1333
1334static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
1335{
1336 return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
1337}
1338
1339static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
1340{
1341 return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
1342}
1343
1344static inline void btrfs_item_key(struct extent_buffer *eb,
1345 struct btrfs_disk_key *disk_key, int nr)
1346{
1347 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1348 read_eb_member(eb, item, struct btrfs_item, key, disk_key);
1349}
1350
1351static inline void btrfs_set_item_key(struct extent_buffer *eb,
1352 struct btrfs_disk_key *disk_key, int nr)
1353{
1354 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1355 write_eb_member(eb, item, struct btrfs_item, key, disk_key);
1356}
1357
1358BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
1359
1360/*
1361 * struct btrfs_root_ref
1362 */
1363BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
1364BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
1365BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
1366
1367/* struct btrfs_dir_item */
1368BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
1369BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
1370BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
1371BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
1372
1373static inline void btrfs_dir_item_key(struct extent_buffer *eb,
1374 struct btrfs_dir_item *item,
1375 struct btrfs_disk_key *key)
1376{
1377 read_eb_member(eb, item, struct btrfs_dir_item, location, key);
1378}
1379
1380static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1381 struct btrfs_dir_item *item,
1382 struct btrfs_disk_key *key)
1383{
1384 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1385}
1386
1387/* struct btrfs_disk_key */
1388BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1389 objectid, 64);
1390BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
1391BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
1392
1393static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
1394 struct btrfs_disk_key *disk)
1395{
1396 cpu->offset = le64_to_cpu(disk->offset);
1397 cpu->type = disk->type;
1398 cpu->objectid = le64_to_cpu(disk->objectid);
1399}
1400
1401static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
1402 struct btrfs_key *cpu)
1403{
1404 disk->offset = cpu_to_le64(cpu->offset);
1405 disk->type = cpu->type;
1406 disk->objectid = cpu_to_le64(cpu->objectid);
1407}
1408
1409static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
1410 struct btrfs_key *key, int nr)
1411{
1412 struct btrfs_disk_key disk_key;
1413 btrfs_node_key(eb, &disk_key, nr);
1414 btrfs_disk_key_to_cpu(key, &disk_key);
1415}
1416
1417static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
1418 struct btrfs_key *key, int nr)
1419{
1420 struct btrfs_disk_key disk_key;
1421 btrfs_item_key(eb, &disk_key, nr);
1422 btrfs_disk_key_to_cpu(key, &disk_key);
1423}
1424
1425static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
1426 struct btrfs_dir_item *item,
1427 struct btrfs_key *key)
1428{
1429 struct btrfs_disk_key disk_key;
1430 btrfs_dir_item_key(eb, item, &disk_key);
1431 btrfs_disk_key_to_cpu(key, &disk_key);
1432}
1433
1434
1435static inline u8 btrfs_key_type(struct btrfs_key *key)
1436{
1437 return key->type;
1438}
1439
1440static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
1441{
1442 key->type = val;
1443}
1444
1445/* struct btrfs_header */
1446BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
1447BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
1448 generation, 64);
1449BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
1450BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
1451BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
1452BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
1453
1454static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
1455{
1456 return (btrfs_header_flags(eb) & flag) == flag;
1457}
1458
1459static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
1460{
1461 u64 flags = btrfs_header_flags(eb);
1462 btrfs_set_header_flags(eb, flags | flag);
1463 return (flags & flag) == flag;
1464}
1465
1466static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1467{
1468 u64 flags = btrfs_header_flags(eb);
1469 btrfs_set_header_flags(eb, flags & ~flag);
1470 return (flags & flag) == flag;
1471}
1472
1473static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1474{
1475 unsigned long ptr = offsetof(struct btrfs_header, fsid);
1476 return (u8 *)ptr;
1477}
1478
1479static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1480{
1481 unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
1482 return (u8 *)ptr;
1483}
1484
1485static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1486{
1487 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1488 return (u8 *)ptr;
1489}
1490
1491static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1492{
1493 unsigned long ptr = offsetof(struct btrfs_header, csum);
1494 return (u8 *)ptr;
1495}
1496
/* Always returns NULL; @eb is ignored. */
static inline struct btrfs_node *
btrfs_buffer_node(struct extent_buffer *eb)
{
	return NULL;
}
1501
/* Always returns NULL; @eb is ignored. */
static inline struct btrfs_leaf *
btrfs_buffer_leaf(struct extent_buffer *eb)
{
	return NULL;
}
1506
/* Always returns NULL; @eb is ignored. */
static inline struct btrfs_header *
btrfs_buffer_header(struct extent_buffer *eb)
{
	return NULL;
}
1511
/* Return 1 when the header level of @eb is zero, 0 otherwise. */
static inline int btrfs_is_leaf(struct extent_buffer *eb)
{
	return !btrfs_header_level(eb);
}
1516
1517/* struct btrfs_root_item */
1518BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
1519 generation, 64);
1520BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
1521BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
1522BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
1523
1524BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
1525 generation, 64);
1526BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
1527BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
1528BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
1529BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
1530BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
1531BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
1532BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1533BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1534 last_snapshot, 64);
1535
1536/* struct btrfs_super_block */
1537
1538BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
1539BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
1540BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
1541 generation, 64);
1542BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
1543BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
1544 struct btrfs_super_block, sys_chunk_array_size, 32);
1545BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
1546 struct btrfs_super_block, chunk_root_generation, 64);
1547BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
1548 root_level, 8);
1549BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
1550 chunk_root, 64);
1551BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
1552 chunk_root_level, 8);
1553BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
1554 log_root, 64);
1555BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
1556 log_root_transid, 64);
1557BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
1558 log_root_level, 8);
1559BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
1560 total_bytes, 64);
1561BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
1562 bytes_used, 64);
1563BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
1564 sectorsize, 32);
1565BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
1566 nodesize, 32);
1567BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
1568 leafsize, 32);
1569BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
1570 stripesize, 32);
1571BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
1572 root_dir_objectid, 64);
1573BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1574 num_devices, 64);
1575BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
1576 compat_flags, 64);
1577BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
1578 compat_flags, 64);
1579BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1580 incompat_flags, 64);
1581BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1582 csum_type, 16);
1583
1584static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1585{
1586 int t = btrfs_super_csum_type(s);
1587 BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
1588 return btrfs_csum_sizes[t];
1589}
1590
1591static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
1592{
1593 return offsetof(struct btrfs_leaf, items);
1594}
1595
1596/* struct btrfs_file_extent_item */
1597BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
1598
1599static inline unsigned long
1600btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
1601{
1602 unsigned long offset = (unsigned long)e;
1603 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
1604 return offset;
1605}
1606
1607static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1608{
1609 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1610}
1611
1612BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1613 disk_bytenr, 64);
1614BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
1615 generation, 64);
1616BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
1617 disk_num_bytes, 64);
1618BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1619 offset, 64);
1620BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1621 num_bytes, 64);
1622BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
1623 ram_bytes, 64);
1624BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
1625 compression, 8);
1626BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
1627 encryption, 8);
1628BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
1629 other_encoding, 16);
1630
1631/* this returns the number of file bytes represented by the inline item.
1632 * If an item is compressed, this is the uncompressed size
1633 */
1634static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1635 struct btrfs_file_extent_item *e)
1636{
1637 return btrfs_file_extent_ram_bytes(eb, e);
1638}
1639
1640/*
1641 * this returns the number of bytes used by the item on disk, minus the
1642 * size of any extent headers. If a file is compressed on disk, this is
1643 * the compressed size
1644 */
1645static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
1646 struct btrfs_item *e)
1647{
1648 unsigned long offset;
1649 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1650 return btrfs_item_size(eb, e) - offset;
1651}
1652
1653static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1654{
1655 return sb->s_fs_info;
1656}
1657
1658static inline int btrfs_set_root_name(struct btrfs_root *root,
1659 const char *name, int len)
1660{
1661 /* if we already have a name just free it */
1662 kfree(root->name);
1663
1664 root->name = kmalloc(len+1, GFP_KERNEL);
1665 if (!root->name)
1666 return -ENOMEM;
1667
1668 memcpy(root->name, name, len);
1669 root->name[len] = '\0';
1670
1671 return 0;
1672}
1673
1674static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
1675{
1676 if (level == 0)
1677 return root->leafsize;
1678 return root->nodesize;
1679}
1680
/*
 * Helper to cast into the data area of the leaf: btrfs_leaf_data(leaf) plus
 * the offset of item @slot, cast to a pointer to @type.
 */
#define btrfs_item_ptr(leaf, slot, type) \
	((type *)(btrfs_leaf_data(leaf) + btrfs_item_offset_nr(leaf, slot)))
1685
/*
 * Same computation as btrfs_item_ptr(), but yields the value as an
 * unsigned long instead of a typed pointer.
 */
#define btrfs_item_ptr_offset(leaf, slot) \
	((unsigned long)(btrfs_leaf_data(leaf) + \
			 btrfs_item_offset_nr(leaf, slot)))
1689
1690static inline struct dentry *fdentry(struct file *file)
1691{
1692 return file->f_path.dentry;
1693}
1694
1695/* extent-tree.c */
1696int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1697int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1698 struct btrfs_root *root, u64 bytenr,
1699 u64 num_bytes, u32 *refs);
1700int btrfs_update_pinned_extents(struct btrfs_root *root,
1701 u64 bytenr, u64 num, int pin);
1702int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1703 struct btrfs_root *root, struct extent_buffer *leaf);
1704int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1705 struct btrfs_root *root, u64 objectid, u64 bytenr);
1706int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1707 struct btrfs_root *root);
1708int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1709struct btrfs_block_group_cache *btrfs_lookup_block_group(
1710 struct btrfs_fs_info *info,
1711 u64 bytenr);
1712u64 btrfs_find_block_group(struct btrfs_root *root,
1713 u64 search_start, u64 search_hint, int owner);
1714struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1715 struct btrfs_root *root,
1716 u32 blocksize, u64 parent,
1717 u64 root_objectid,
1718 u64 ref_generation,
1719 int level,
1720 u64 hint,
1721 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes,
1728 u64 root_objectid, u64 ref_generation,
1729 u64 owner, u64 empty_size, u64 hint_byte,
1730 u64 search_end, struct btrfs_key *ins, u64 data);
1731int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
1732 struct btrfs_root *root, u64 parent,
1733 u64 root_objectid, u64 ref_generation,
1734 u64 owner, struct btrfs_key *ins);
1735int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1736 struct btrfs_root *root, u64 parent,
1737 u64 root_objectid, u64 ref_generation,
1738 u64 owner, struct btrfs_key *ins);
1739int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1740 struct btrfs_root *root,
1741 u64 num_bytes, u64 min_alloc_size,
1742 u64 empty_size, u64 hint_byte,
1743 u64 search_end, struct btrfs_key *ins,
1744 u64 data);
1745int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1746 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1747 u32 *nr_extents);
1748int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1749 struct extent_buffer *buf, u32 nr_extents);
1750int btrfs_update_ref(struct btrfs_trans_handle *trans,
1751 struct btrfs_root *root, struct extent_buffer *orig_buf,
1752 struct extent_buffer *buf, int start_slot, int nr);
1753int btrfs_free_extent(struct btrfs_trans_handle *trans,
1754 struct btrfs_root *root,
1755 u64 bytenr, u64 num_bytes, u64 parent,
1756 u64 root_objectid, u64 ref_generation,
1757 u64 owner_objectid, int pin);
1758int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1759int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1760 struct btrfs_root *root,
1761 struct extent_io_tree *unpin);
1762int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root,
1764 u64 bytenr, u64 num_bytes, u64 parent,
1765 u64 root_objectid, u64 ref_generation,
1766 u64 owner_objectid);
1767int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1768 struct btrfs_root *root, u64 bytenr,
1769 u64 orig_parent, u64 parent,
1770 u64 root_objectid, u64 ref_generation,
1771 u64 owner_objectid);
1772int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root);
1774int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
1775int btrfs_free_block_groups(struct btrfs_fs_info *info);
1776int btrfs_read_block_groups(struct btrfs_root *root);
1777int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1778 struct btrfs_root *root, u64 bytes_used,
1779 u64 type, u64 chunk_objectid, u64 chunk_offset,
1780 u64 size);
1781int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1782 struct btrfs_root *root, u64 group_start);
1783int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
1784int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
1785 struct btrfs_root *root);
1786int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1787int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1788 struct btrfs_root *root,
1789 struct extent_buffer *buf, u64 orig_start);
1790int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1791int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1792int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1793u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1794/* ctree.c */
1795int btrfs_previous_item(struct btrfs_root *root,
1796 struct btrfs_path *path, u64 min_objectid,
1797 int type);
1798int btrfs_merge_path(struct btrfs_trans_handle *trans,
1799 struct btrfs_root *root,
1800 struct btrfs_key *node_keys,
1801 u64 *nodes, int lowest_level);
1802int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1803 struct btrfs_root *root, struct btrfs_path *path,
1804 struct btrfs_key *new_key);
1805struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
1806struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
1807int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
1808 struct btrfs_key *key, int lowest_level,
1809 int cache_only, u64 min_trans);
1810int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1811 struct btrfs_key *max_key,
1812 struct btrfs_path *path, int cache_only,
1813 u64 min_trans);
1814int btrfs_cow_block(struct btrfs_trans_handle *trans,
1815 struct btrfs_root *root, struct extent_buffer *buf,
1816 struct extent_buffer *parent, int parent_slot,
1817 struct extent_buffer **cow_ret, u64 prealloc_dest);
1818int btrfs_copy_root(struct btrfs_trans_handle *trans,
1819 struct btrfs_root *root,
1820 struct extent_buffer *buf,
1821 struct extent_buffer **cow_ret, u64 new_root_objectid);
1822int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1823 *root, struct btrfs_path *path, u32 data_size);
1824int btrfs_truncate_item(struct btrfs_trans_handle *trans,
1825 struct btrfs_root *root,
1826 struct btrfs_path *path,
1827 u32 new_size, int from_end);
1828int btrfs_split_item(struct btrfs_trans_handle *trans,
1829 struct btrfs_root *root,
1830 struct btrfs_path *path,
1831 struct btrfs_key *new_key,
1832 unsigned long split_offset);
1833int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1834 *root, struct btrfs_key *key, struct btrfs_path *p, int
1835 ins_len, int cow);
1836int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1837 struct btrfs_root *root, struct extent_buffer *parent,
1838 int start_slot, int cache_only, u64 *last_ret,
1839 struct btrfs_key *progress);
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p);
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root,
1848 struct btrfs_path *path, u64 bytenr);
1849static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1850 struct btrfs_root *root,
1851 struct btrfs_path *path)
1852{
1853 return btrfs_del_items(trans, root, path, path->slots[0], 1);
1854}
1855
1856int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
1857 *root, struct btrfs_key *key, void *data, u32 data_size);
1858int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
1859 struct btrfs_root *root,
1860 struct btrfs_path *path,
1861 struct btrfs_key *cpu_key, u32 *data_size,
1862 int nr);
1863int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
1864 struct btrfs_root *root,
1865 struct btrfs_path *path,
1866 struct btrfs_key *cpu_key, u32 *data_size, int nr);
1867
1868static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1869 struct btrfs_root *root,
1870 struct btrfs_path *path,
1871 struct btrfs_key *key,
1872 u32 data_size)
1873{
1874 return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
1875}
1876
1877int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1878int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1879int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1880int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
1881 *root);
1882int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
1883 struct btrfs_root *root,
1884 struct extent_buffer *node,
1885 struct extent_buffer *parent);
1886/* root-item.c */
1887int btrfs_find_root_ref(struct btrfs_root *tree_root,
1888 struct btrfs_path *path,
1889 u64 root_id, u64 ref_id);
1890int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
1891 struct btrfs_root *tree_root,
1892 u64 root_id, u8 type, u64 ref_id,
1893 u64 dirid, u64 sequence,
1894 const char *name, int name_len);
1895int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1896 struct btrfs_key *key);
1897int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
1898 *root, struct btrfs_key *key, struct btrfs_root_item
1899 *item);
1900int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
1901 *root, struct btrfs_key *key, struct btrfs_root_item
1902 *item);
1903int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
1904 btrfs_root_item *item, struct btrfs_key *key);
1905int btrfs_search_root(struct btrfs_root *root, u64 search_start,
1906 u64 *found_objectid);
1907int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
1908 struct btrfs_root *latest_root);
1909/* dir-item.c */
1910int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
1911 struct btrfs_root *root, const char *name,
1912 int name_len, u64 dir,
1913 struct btrfs_key *location, u8 type, u64 index);
1914struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
1915 struct btrfs_root *root,
1916 struct btrfs_path *path, u64 dir,
1917 const char *name, int name_len,
1918 int mod);
1919struct btrfs_dir_item *
1920btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1921 struct btrfs_root *root,
1922 struct btrfs_path *path, u64 dir,
1923 u64 objectid, const char *name, int name_len,
1924 int mod);
1925struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1926 struct btrfs_path *path,
1927 const char *name, int name_len);
1928int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
1929 struct btrfs_root *root,
1930 struct btrfs_path *path,
1931 struct btrfs_dir_item *di);
1932int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root, const char *name,
1934 u16 name_len, const void *data, u16 data_len,
1935 u64 dir);
1936struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1937 struct btrfs_root *root,
1938 struct btrfs_path *path, u64 dir,
1939 const char *name, u16 name_len,
1940 int mod);
1941
1942/* orphan.c */
1943int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1944 struct btrfs_root *root, u64 offset);
1945int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1946 struct btrfs_root *root, u64 offset);
1947
1948/* inode-map.c */
1949int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1950 struct btrfs_root *fs_root,
1951 u64 dirid, u64 *objectid);
1952int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
1953
1954/* inode-item.c */
1955int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
1956 struct btrfs_root *root,
1957 const char *name, int name_len,
1958 u64 inode_objectid, u64 ref_objectid, u64 index);
1959int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
1960 struct btrfs_root *root,
1961 const char *name, int name_len,
1962 u64 inode_objectid, u64 ref_objectid, u64 *index);
1963int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
1964 struct btrfs_root *root,
1965 struct btrfs_path *path, u64 objectid);
1966int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1967 *root, struct btrfs_path *path,
1968 struct btrfs_key *location, int mod);
1969
1970/* file-item.c */
1971int btrfs_del_csums(struct btrfs_trans_handle *trans,
1972 struct btrfs_root *root, u64 bytenr, u64 len);
1973int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1974 struct bio *bio, u32 *dst);
1975int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1976 struct btrfs_root *root,
1977 u64 objectid, u64 pos,
1978 u64 disk_offset, u64 disk_num_bytes,
1979 u64 num_bytes, u64 offset, u64 ram_bytes,
1980 u8 compression, u8 encryption, u16 other_encoding);
1981int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1982 struct btrfs_root *root,
1983 struct btrfs_path *path, u64 objectid,
1984 u64 bytenr, int mod);
1985int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1986 struct btrfs_root *root,
1987 struct btrfs_ordered_sum *sums);
1988int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1989 struct bio *bio, u64 file_start, int contig);
1990int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
1991 u64 start, unsigned long len);
1992struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1993 struct btrfs_root *root,
1994 struct btrfs_path *path,
1995 u64 bytenr, int cow);
1996int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, struct btrfs_path *path,
1998 u64 isize);
1999int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
2000 u64 end, struct list_head *list);
2001/* inode.c */
2002
2003/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
2004#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
2005#define ClearPageChecked ClearPageFsMisc
2006#define SetPageChecked SetPageFsMisc
2007#define PageChecked PageFsMisc
2008#endif
2009
2010struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
2011int btrfs_set_inode_index(struct inode *dir, u64 *index);
2012int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2013 struct btrfs_root *root,
2014 struct inode *dir, struct inode *inode,
2015 const char *name, int name_len);
2016int btrfs_add_link(struct btrfs_trans_handle *trans,
2017 struct inode *parent_inode, struct inode *inode,
2018 const char *name, int name_len, int add_backref, u64 index);
2019int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2020 struct btrfs_root *root,
2021 struct inode *inode, u64 new_size,
2022 u32 min_type);
2023
2024int btrfs_start_delalloc_inodes(struct btrfs_root *root);
2025int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
2026int btrfs_writepages(struct address_space *mapping,
2027 struct writeback_control *wbc);
2028int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2029 struct btrfs_root *new_root, struct dentry *dentry,
2030 u64 new_dirid, u64 alloc_hint);
2031int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2032 size_t size, struct bio *bio, unsigned long bio_flags);
2033
2034unsigned long btrfs_force_ra(struct address_space *mapping,
2035 struct file_ra_state *ra, struct file *file,
2036 pgoff_t offset, pgoff_t last_index);
2037int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
2038 int for_del);
2039int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2040int btrfs_readpage(struct file *file, struct page *page);
2041void btrfs_delete_inode(struct inode *inode);
2042void btrfs_put_inode(struct inode *inode);
2043void btrfs_read_locked_inode(struct inode *inode);
2044int btrfs_write_inode(struct inode *inode, int wait);
2045void btrfs_dirty_inode(struct inode *inode);
2046struct inode *btrfs_alloc_inode(struct super_block *sb);
2047void btrfs_destroy_inode(struct inode *inode);
2048int btrfs_init_cachep(void);
2049void btrfs_destroy_cachep(void);
2050long btrfs_ioctl_trans_end(struct file *file);
2051struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2052 struct btrfs_root *root, int wait);
2053struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2054 struct btrfs_root *root);
2055struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2056 struct btrfs_root *root, int *is_new);
2057int btrfs_commit_write(struct file *file, struct page *page,
2058 unsigned from, unsigned to);
2059struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2060 size_t page_offset, u64 start, u64 end,
2061 int create);
2062int btrfs_update_inode(struct btrfs_trans_handle *trans,
2063 struct btrfs_root *root,
2064 struct inode *inode);
2065int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2066int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2067void btrfs_orphan_cleanup(struct btrfs_root *root);
2068int btrfs_cont_expand(struct inode *inode, loff_t size);
2069
2070/* ioctl.c */
2071long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2072
2073/* file.c */
2074int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
2075int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2076 int skip_pinned);
2077int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2078extern struct file_operations btrfs_file_operations;
2079int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2080 struct btrfs_root *root, struct inode *inode,
2081 u64 start, u64 end, u64 inline_limit, u64 *hint_block);
2082int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2083 struct btrfs_root *root,
2084 struct inode *inode, u64 start, u64 end);
2085int btrfs_release_file(struct inode *inode, struct file *file);
2086
2087/* tree-defrag.c */
2088int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
2089 struct btrfs_root *root, int cache_only);
2090
2091/* sysfs.c */
2092int btrfs_init_sysfs(void);
2093void btrfs_exit_sysfs(void);
2094int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
2095int btrfs_sysfs_add_root(struct btrfs_root *root);
2096void btrfs_sysfs_del_root(struct btrfs_root *root);
2097void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2098
2099/* xattr.c */
2100ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2101
2102/* super.c */
2103u64 btrfs_parse_size(char *str);
2104int btrfs_parse_options(struct btrfs_root *root, char *options);
2105int btrfs_sync_fs(struct super_block *sb, int wait);
2106
2107/* acl.c */
2108int btrfs_check_acl(struct inode *inode, int mask);
2109int btrfs_init_acl(struct inode *inode, struct inode *dir);
2110int btrfs_acl_chmod(struct inode *inode);
2111
2112/* free-space-cache.c */
2113int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2114 u64 bytenr, u64 size);
2115int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2116 u64 offset, u64 bytes);
2117int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2118 u64 bytenr, u64 size);
2119int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2120 u64 offset, u64 bytes);
2121void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2122 *block_group);
2123struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2124 *block_group, u64 offset,
2125 u64 bytes);
2126void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2127 u64 bytes);
2128u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2129#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..926a0b287a7d
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "hash.h"
22#include "transaction.h"
23
24/*
25 * insert a name into a directory, doing overflow properly if there is a hash
26 * collision. data_size indicates how big the item inserted should be. On
27 * success a struct btrfs_dir_item pointer is returned, otherwise it is
28 * an ERR_PTR.
29 *
30 * The name is not copied into the dir item, you have to do that yourself.
31 */
32static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
33 *trans,
34 struct btrfs_root *root,
35 struct btrfs_path *path,
36 struct btrfs_key *cpu_key,
37 u32 data_size,
38 const char *name,
39 int name_len)
40{
41 int ret;
42 char *ptr;
43 struct btrfs_item *item;
44 struct extent_buffer *leaf;
45
46 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
47 if (ret == -EEXIST) {
48 struct btrfs_dir_item *di;
49 di = btrfs_match_dir_item_name(root, path, name, name_len);
50 if (di)
51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 }
55 if (ret < 0)
56 return ERR_PTR(ret);
57 WARN_ON(ret > 0);
58 leaf = path->nodes[0];
59 item = btrfs_item_nr(leaf, path->slots[0]);
60 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
61 BUG_ON(data_size > btrfs_item_size(leaf, item));
62 ptr += btrfs_item_size(leaf, item) - data_size;
63 return (struct btrfs_dir_item *)ptr;
64}
65
66/*
67 * xattrs work a lot like directories, this inserts an xattr item
68 * into the tree
69 */
70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
71 struct btrfs_root *root, const char *name,
72 u16 name_len, const void *data, u16 data_len,
73 u64 dir)
74{
75 int ret = 0;
76 struct btrfs_path *path;
77 struct btrfs_dir_item *dir_item;
78 unsigned long name_ptr, data_ptr;
79 struct btrfs_key key, location;
80 struct btrfs_disk_key disk_key;
81 struct extent_buffer *leaf;
82 u32 data_size;
83
84 key.objectid = dir;
85 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
86 key.offset = btrfs_name_hash(name, name_len);
87 path = btrfs_alloc_path();
88 if (!path)
89 return -ENOMEM;
90 if (name_len + data_len + sizeof(struct btrfs_dir_item) >
91 BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
92 return -ENOSPC;
93
94 data_size = sizeof(*dir_item) + name_len + data_len;
95 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
96 name, name_len);
97 /*
98 * FIXME: at some point we should handle xattr's that are larger than
99 * what we can fit in our leaf. We set location to NULL b/c we arent
100 * pointing at anything else, that will change if we store the xattr
101 * data in a separate inode.
102 */
103 BUG_ON(IS_ERR(dir_item));
104 memset(&location, 0, sizeof(location));
105
106 leaf = path->nodes[0];
107 btrfs_cpu_key_to_disk(&disk_key, &location);
108 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
109 btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
110 btrfs_set_dir_name_len(leaf, dir_item, name_len);
111 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
112 btrfs_set_dir_data_len(leaf, dir_item, data_len);
113 name_ptr = (unsigned long)(dir_item + 1);
114 data_ptr = (unsigned long)((char *)name_ptr + name_len);
115
116 write_extent_buffer(leaf, name, name_ptr, name_len);
117 write_extent_buffer(leaf, data, data_ptr, data_len);
118 btrfs_mark_buffer_dirty(path->nodes[0]);
119
120 btrfs_free_path(path);
121 return ret;
122}
123
124/*
125 * insert a directory item in the tree, doing all the magic for
126 * both indexes. 'dir' indicates which objectid to insert it into,
127 * 'location' is the key to stuff into the directory item, 'type' is the
128 * type of the inode we're pointing to, and 'index' is the sequence number
129 * to use for the second index (if one is created).
130 */
131int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
132 *root, const char *name, int name_len, u64 dir,
133 struct btrfs_key *location, u8 type, u64 index)
134{
135 int ret = 0;
136 int ret2 = 0;
137 struct btrfs_path *path;
138 struct btrfs_dir_item *dir_item;
139 struct extent_buffer *leaf;
140 unsigned long name_ptr;
141 struct btrfs_key key;
142 struct btrfs_disk_key disk_key;
143 u32 data_size;
144
145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len);
148 path = btrfs_alloc_path();
149 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len);
152 if (IS_ERR(dir_item)) {
153 ret = PTR_ERR(dir_item);
154 if (ret == -EEXIST)
155 goto second_insert;
156 goto out;
157 }
158
159 leaf = path->nodes[0];
160 btrfs_cpu_key_to_disk(&disk_key, location);
161 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
162 btrfs_set_dir_type(leaf, dir_item, type);
163 btrfs_set_dir_data_len(leaf, dir_item, 0);
164 btrfs_set_dir_name_len(leaf, dir_item, name_len);
165 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
166 name_ptr = (unsigned long)(dir_item + 1);
167
168 write_extent_buffer(leaf, name, name_ptr, name_len);
169 btrfs_mark_buffer_dirty(leaf);
170
171second_insert:
172 /* FIXME, use some real flag for selecting the extra index */
173 if (root == root->fs_info->tree_root) {
174 ret = 0;
175 goto out;
176 }
177 btrfs_release_path(root, path);
178
179 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
180 key.offset = index;
181 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
182 name, name_len);
183 if (IS_ERR(dir_item)) {
184 ret2 = PTR_ERR(dir_item);
185 goto out;
186 }
187 leaf = path->nodes[0];
188 btrfs_cpu_key_to_disk(&disk_key, location);
189 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
190 btrfs_set_dir_type(leaf, dir_item, type);
191 btrfs_set_dir_data_len(leaf, dir_item, 0);
192 btrfs_set_dir_name_len(leaf, dir_item, name_len);
193 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
194 name_ptr = (unsigned long)(dir_item + 1);
195 write_extent_buffer(leaf, name, name_ptr, name_len);
196 btrfs_mark_buffer_dirty(leaf);
197out:
198 btrfs_free_path(path);
199 if (ret)
200 return ret;
201 if (ret2)
202 return ret2;
203 return 0;
204}
205
206/*
207 * lookup a directory item based on name. 'dir' is the objectid
208 * we're searching in, and 'mod' tells us if you plan on deleting the
209 * item (use mod < 0) or changing the options (use mod > 0)
210 */
211struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
212 struct btrfs_root *root,
213 struct btrfs_path *path, u64 dir,
214 const char *name, int name_len,
215 int mod)
216{
217 int ret;
218 struct btrfs_key key;
219 int ins_len = mod < 0 ? -1 : 0;
220 int cow = mod != 0;
221 struct btrfs_key found_key;
222 struct extent_buffer *leaf;
223
224 key.objectid = dir;
225 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
226
227 key.offset = btrfs_name_hash(name, name_len);
228
229 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
230 if (ret < 0)
231 return ERR_PTR(ret);
232 if (ret > 0) {
233 if (path->slots[0] == 0)
234 return NULL;
235 path->slots[0]--;
236 }
237
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
240
241 if (found_key.objectid != dir ||
242 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
243 found_key.offset != key.offset)
244 return NULL;
245
246 return btrfs_match_dir_item_name(root, path, name, name_len);
247}
248
249/*
250 * lookup a directory item based on index. 'dir' is the objectid
251 * we're searching in, and 'mod' tells us if you plan on deleting the
252 * item (use mod < 0) or changing the options (use mod > 0)
253 *
254 * The name is used to make sure the index really points to the name you were
255 * looking for.
256 */
257struct btrfs_dir_item *
258btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct btrfs_path *path, u64 dir,
261 u64 objectid, const char *name, int name_len,
262 int mod)
263{
264 int ret;
265 struct btrfs_key key;
266 int ins_len = mod < 0 ? -1 : 0;
267 int cow = mod != 0;
268
269 key.objectid = dir;
270 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
271 key.offset = objectid;
272
273 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
274 if (ret < 0)
275 return ERR_PTR(ret);
276 if (ret > 0)
277 return ERR_PTR(-ENOENT);
278 return btrfs_match_dir_item_name(root, path, name, name_len);
279}
280
281struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
282 struct btrfs_root *root,
283 struct btrfs_path *path, u64 dir,
284 const char *name, u16 name_len,
285 int mod)
286{
287 int ret;
288 struct btrfs_key key;
289 int ins_len = mod < 0 ? -1 : 0;
290 int cow = mod != 0;
291 struct btrfs_key found_key;
292 struct extent_buffer *leaf;
293
294 key.objectid = dir;
295 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
296 key.offset = btrfs_name_hash(name, name_len);
297 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
298 if (ret < 0)
299 return ERR_PTR(ret);
300 if (ret > 0) {
301 if (path->slots[0] == 0)
302 return NULL;
303 path->slots[0]--;
304 }
305
306 leaf = path->nodes[0];
307 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
308
309 if (found_key.objectid != dir ||
310 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
311 found_key.offset != key.offset)
312 return NULL;
313
314 return btrfs_match_dir_item_name(root, path, name, name_len);
315}
316
317/*
318 * helper function to look at the directory item pointed to by 'path'
319 * this walks through all the entries in a dir item and finds one
320 * for a specific name.
321 */
322struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
323 struct btrfs_path *path,
324 const char *name, int name_len)
325{
326 struct btrfs_dir_item *dir_item;
327 unsigned long name_ptr;
328 u32 total_len;
329 u32 cur = 0;
330 u32 this_len;
331 struct extent_buffer *leaf;
332
333 leaf = path->nodes[0];
334 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
335 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
336 while (cur < total_len) {
337 this_len = sizeof(*dir_item) +
338 btrfs_dir_name_len(leaf, dir_item) +
339 btrfs_dir_data_len(leaf, dir_item);
340 name_ptr = (unsigned long)(dir_item + 1);
341
342 if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
343 memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
344 return dir_item;
345
346 cur += this_len;
347 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
348 this_len);
349 }
350 return NULL;
351}
352
353/*
354 * given a pointer into a directory item, delete it. This
355 * handles items that have more than one entry in them.
356 */
357int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
358 struct btrfs_root *root,
359 struct btrfs_path *path,
360 struct btrfs_dir_item *di)
361{
362
363 struct extent_buffer *leaf;
364 u32 sub_item_len;
365 u32 item_len;
366 int ret = 0;
367
368 leaf = path->nodes[0];
369 sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
370 btrfs_dir_data_len(leaf, di);
371 item_len = btrfs_item_size_nr(leaf, path->slots[0]);
372 if (sub_item_len == item_len) {
373 ret = btrfs_del_item(trans, root, path);
374 } else {
375 /* MARKER */
376 unsigned long ptr = (unsigned long)di;
377 unsigned long start;
378
379 start = btrfs_item_ptr_offset(leaf, path->slots[0]);
380 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
381 item_len - (ptr + sub_item_len - start));
382 ret = btrfs_truncate_item(trans, root, path,
383 item_len - sub_item_len, 1);
384 }
385 return 0;
386}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..81a313874ae5
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2343 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/fs.h>
21#include <linux/blkdev.h>
22#include <linux/scatterlist.h>
23#include <linux/swap.h>
24#include <linux/radix-tree.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h>
27#include <linux/workqueue.h>
28#include <linux/kthread.h>
29#include <linux/freezer.h>
30#include "compat.h"
31#include "crc32c.h"
32#include "ctree.h"
33#include "disk-io.h"
34#include "transaction.h"
35#include "btrfs_inode.h"
36#include "volumes.h"
37#include "print-tree.h"
38#include "async-thread.h"
39#include "locking.h"
40#include "ref-cache.h"
41#include "tree-log.h"
42
43static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work);
45
46/*
47 * end_io_wq structs are used to do processing in task context when an IO is
48 * complete. This is used during reads to verify checksums, and it is used
49 * by writes to insert metadata for new file extents after IO is complete.
50 */
51struct end_io_wq {
52 struct bio *bio;
53 bio_end_io_t *end_io;
54 void *private;
55 struct btrfs_fs_info *info;
56 int error;
57 int metadata;
58 struct list_head list;
59 struct btrfs_work work;
60};
61
62/*
63 * async submit bios are used to offload expensive checksumming
64 * onto the worker threads. They checksum file and metadata bios
65 * just before they are sent down the IO stack.
66 */
67struct async_submit_bio {
68 struct inode *inode;
69 struct bio *bio;
70 struct list_head list;
71 extent_submit_bio_hook_t *submit_bio_start;
72 extent_submit_bio_hook_t *submit_bio_done;
73 int rw;
74 int mirror_num;
75 unsigned long bio_flags;
76 struct btrfs_work work;
77};
78
79/*
80 * extents on the btree inode are pretty simple, there's one extent
81 * that covers the entire device
82 */
83static struct extent_map *btree_get_extent(struct inode *inode,
84 struct page *page, size_t page_offset, u64 start, u64 len,
85 int create)
86{
87 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
88 struct extent_map *em;
89 int ret;
90
91 spin_lock(&em_tree->lock);
92 em = lookup_extent_mapping(em_tree, start, len);
93 if (em) {
94 em->bdev =
95 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
96 spin_unlock(&em_tree->lock);
97 goto out;
98 }
99 spin_unlock(&em_tree->lock);
100
101 em = alloc_extent_map(GFP_NOFS);
102 if (!em) {
103 em = ERR_PTR(-ENOMEM);
104 goto out;
105 }
106 em->start = 0;
107 em->len = (u64)-1;
108 em->block_len = (u64)-1;
109 em->block_start = 0;
110 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
111
112 spin_lock(&em_tree->lock);
113 ret = add_extent_mapping(em_tree, em);
114 if (ret == -EEXIST) {
115 u64 failed_start = em->start;
116 u64 failed_len = em->len;
117
118 free_extent_map(em);
119 em = lookup_extent_mapping(em_tree, start, len);
120 if (em) {
121 ret = 0;
122 } else {
123 em = lookup_extent_mapping(em_tree, failed_start,
124 failed_len);
125 ret = -EIO;
126 }
127 } else if (ret) {
128 free_extent_map(em);
129 em = NULL;
130 }
131 spin_unlock(&em_tree->lock);
132
133 if (ret)
134 em = ERR_PTR(ret);
135out:
136 return em;
137}
138
139u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
140{
141 return btrfs_crc32c(seed, data, len);
142}
143
144void btrfs_csum_final(u32 crc, char *result)
145{
146 *(__le32 *)result = ~cpu_to_le32(crc);
147}
148
149/*
150 * compute the csum for a btree block, and either verify it or write it
151 * into the csum field of the block.
152 */
153static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
154 int verify)
155{
156 u16 csum_size =
157 btrfs_super_csum_size(&root->fs_info->super_copy);
158 char *result = NULL;
159 unsigned long len;
160 unsigned long cur_len;
161 unsigned long offset = BTRFS_CSUM_SIZE;
162 char *map_token = NULL;
163 char *kaddr;
164 unsigned long map_start;
165 unsigned long map_len;
166 int err;
167 u32 crc = ~(u32)0;
168 unsigned long inline_result;
169
170 len = buf->len - offset;
171 while (len > 0) {
172 err = map_private_extent_buffer(buf, offset, 32,
173 &map_token, &kaddr,
174 &map_start, &map_len, KM_USER0);
175 if (err)
176 return 1;
177 cur_len = min(len, map_len - (offset - map_start));
178 crc = btrfs_csum_data(root, kaddr + offset - map_start,
179 crc, cur_len);
180 len -= cur_len;
181 offset += cur_len;
182 unmap_extent_buffer(buf, map_token, KM_USER0);
183 }
184 if (csum_size > sizeof(inline_result)) {
185 result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
186 if (!result)
187 return 1;
188 } else {
189 result = (char *)&inline_result;
190 }
191
192 btrfs_csum_final(crc, result);
193
194 if (verify) {
195 if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
196 u32 val;
197 u32 found = 0;
198 memcpy(&found, result, csum_size);
199
200 read_extent_buffer(buf, &val, 0, csum_size);
201 printk(KERN_INFO "btrfs: %s checksum verify failed "
202 "on %llu wanted %X found %X level %d\n",
203 root->fs_info->sb->s_id,
204 buf->start, val, found, btrfs_header_level(buf));
205 if (result != (char *)&inline_result)
206 kfree(result);
207 return 1;
208 }
209 } else {
210 write_extent_buffer(buf, result, 0, csum_size);
211 }
212 if (result != (char *)&inline_result)
213 kfree(result);
214 return 0;
215}
216
217/*
218 * we can't consider a given block up to date unless the transid of the
219 * block matches the transid in the parent node's pointer. This is how we
220 * detect blocks that either didn't get written at all or got written
221 * in the wrong place.
222 */
223static int verify_parent_transid(struct extent_io_tree *io_tree,
224 struct extent_buffer *eb, u64 parent_transid)
225{
226 int ret;
227
228 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
229 return 0;
230
231 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
232 if (extent_buffer_uptodate(io_tree, eb) &&
233 btrfs_header_generation(eb) == parent_transid) {
234 ret = 0;
235 goto out;
236 }
237 printk("parent transid verify failed on %llu wanted %llu found %llu\n",
238 (unsigned long long)eb->start,
239 (unsigned long long)parent_transid,
240 (unsigned long long)btrfs_header_generation(eb));
241 ret = 1;
242 clear_extent_buffer_uptodate(io_tree, eb);
243out:
244 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
245 GFP_NOFS);
246 return ret;
247}
248
249/*
250 * helper to read a given tree block, doing retries as required when
251 * the checksums don't match and we have alternate mirrors to try.
252 */
253static int btree_read_extent_buffer_pages(struct btrfs_root *root,
254 struct extent_buffer *eb,
255 u64 start, u64 parent_transid)
256{
257 struct extent_io_tree *io_tree;
258 int ret;
259 int num_copies = 0;
260 int mirror_num = 0;
261
262 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
263 while (1) {
264 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
265 btree_get_extent, mirror_num);
266 if (!ret &&
267 !verify_parent_transid(io_tree, eb, parent_transid))
268 return ret;
269
270 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
271 eb->start, eb->len);
272 if (num_copies == 1)
273 return ret;
274
275 mirror_num++;
276 if (mirror_num > num_copies)
277 return ret;
278 }
279 return -EIO;
280}
281
282/*
283 * checksum a dirty tree block before IO. This has extra checks to make sure
284 * we only fill in the checksum field in the first page of a multi-page block
285 */
286
287static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
288{
289 struct extent_io_tree *tree;
290 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
291 u64 found_start;
292 int found_level;
293 unsigned long len;
294 struct extent_buffer *eb;
295 int ret;
296
297 tree = &BTRFS_I(page->mapping->host)->io_tree;
298
299 if (page->private == EXTENT_PAGE_PRIVATE)
300 goto out;
301 if (!page->private)
302 goto out;
303 len = page->private >> 2;
304 WARN_ON(len == 0);
305
306 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
307 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
308 btrfs_header_generation(eb));
309 BUG_ON(ret);
310 found_start = btrfs_header_bytenr(eb);
311 if (found_start != start) {
312 WARN_ON(1);
313 goto err;
314 }
315 if (eb->first_page != page) {
316 WARN_ON(1);
317 goto err;
318 }
319 if (!PageUptodate(page)) {
320 WARN_ON(1);
321 goto err;
322 }
323 found_level = btrfs_header_level(eb);
324
325 csum_tree_block(root, eb, 0);
326err:
327 free_extent_buffer(eb);
328out:
329 return 0;
330}
331
332static int check_tree_block_fsid(struct btrfs_root *root,
333 struct extent_buffer *eb)
334{
335 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
336 u8 fsid[BTRFS_UUID_SIZE];
337 int ret = 1;
338
339 read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
340 BTRFS_FSID_SIZE);
341 while (fs_devices) {
342 if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
343 ret = 0;
344 break;
345 }
346 fs_devices = fs_devices->seed;
347 }
348 return ret;
349}
350
/*
 * Sanity check a btree page once its read has finished.
 *
 * Pages whose private field is zero or EXTENT_PAGE_PRIVATE are accepted
 * without any checks.  For the others the extent buffer is looked up and
 * the header bytenr, the first-page pointer and the fsid are checked
 * before csum_tree_block() is called with a final argument of 1
 * (presumably "verify" -- confirm against csum_tree_block()).
 *
 * Returns 0 on success and -EIO on any failed check.
 */
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
				      struct extent_state *state)
{
	struct extent_io_tree *tree;
	u64 found_start;
	int found_level;
	unsigned long len;
	struct extent_buffer *eb;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	int ret = 0;

	tree = &BTRFS_I(page->mapping->host)->io_tree;
	if (page->private == EXTENT_PAGE_PRIVATE)
		goto out;
	if (!page->private)
		goto out;

	/* the block length is stored in page->private shifted left by two */
	len = page->private >> 2;
	WARN_ON(len == 0);

	/* NOTE(review): the result is used without a NULL check */
	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);

	/* the header must claim the offset we actually read from */
	found_start = btrfs_header_bytenr(eb);
	if (found_start != start) {
		printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
		       (unsigned long long)found_start,
		       (unsigned long long)eb->start);
		ret = -EIO;
		goto err;
	}
	if (eb->first_page != page) {
		printk(KERN_INFO "btrfs bad first page %lu %lu\n",
		       eb->first_page->index, page->index);
		WARN_ON(1);
		ret = -EIO;
		goto err;
	}
	if (check_tree_block_fsid(root, eb)) {
		printk(KERN_INFO "btrfs bad fsid on block %llu\n",
		       (unsigned long long)eb->start);
		ret = -EIO;
		goto err;
	}
	/* NOTE(review): found_level is assigned but never read */
	found_level = btrfs_header_level(eb);

	/* any non-zero result is reported to the caller as -EIO */
	ret = csum_tree_block(root, eb, 1);
	if (ret)
		ret = -EIO;

	/* NOTE(review): 'end' is not read again after these two stores */
	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
	end = eb->start + end - 1;
err:
	free_extent_buffer(eb);
out:
	return ret;
}
407
408static void end_workqueue_bio(struct bio *bio, int err)
409{
410 struct end_io_wq *end_io_wq = bio->bi_private;
411 struct btrfs_fs_info *fs_info;
412
413 fs_info = end_io_wq->info;
414 end_io_wq->error = err;
415 end_io_wq->work.func = end_workqueue_fn;
416 end_io_wq->work.flags = 0;
417
418 if (bio->bi_rw & (1 << BIO_RW)) {
419 if (end_io_wq->metadata)
420 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
421 &end_io_wq->work);
422 else
423 btrfs_queue_worker(&fs_info->endio_write_workers,
424 &end_io_wq->work);
425 } else {
426 if (end_io_wq->metadata)
427 btrfs_queue_worker(&fs_info->endio_meta_workers,
428 &end_io_wq->work);
429 else
430 btrfs_queue_worker(&fs_info->endio_workers,
431 &end_io_wq->work);
432 }
433}
434
435int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
436 int metadata)
437{
438 struct end_io_wq *end_io_wq;
439 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
440 if (!end_io_wq)
441 return -ENOMEM;
442
443 end_io_wq->private = bio->bi_private;
444 end_io_wq->end_io = bio->bi_end_io;
445 end_io_wq->info = info;
446 end_io_wq->error = 0;
447 end_io_wq->bio = bio;
448 end_io_wq->metadata = metadata;
449
450 bio->bi_private = end_io_wq;
451 bio->bi_end_io = end_workqueue_bio;
452 return 0;
453}
454
455unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
456{
457 unsigned long limit = min_t(unsigned long,
458 info->workers.max_workers,
459 info->fs_devices->open_devices);
460 return 256 * limit;
461}
462
463int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
464{
465 return atomic_read(&info->nr_async_bios) >
466 btrfs_async_submit_limit(info);
467}
468
469static void run_one_async_start(struct btrfs_work *work)
470{
471 struct btrfs_fs_info *fs_info;
472 struct async_submit_bio *async;
473
474 async = container_of(work, struct async_submit_bio, work);
475 fs_info = BTRFS_I(async->inode)->root->fs_info;
476 async->submit_bio_start(async->inode, async->rw, async->bio,
477 async->mirror_num, async->bio_flags);
478}
479
480static void run_one_async_done(struct btrfs_work *work)
481{
482 struct btrfs_fs_info *fs_info;
483 struct async_submit_bio *async;
484 int limit;
485
486 async = container_of(work, struct async_submit_bio, work);
487 fs_info = BTRFS_I(async->inode)->root->fs_info;
488
489 limit = btrfs_async_submit_limit(fs_info);
490 limit = limit * 2 / 3;
491
492 atomic_dec(&fs_info->nr_async_submits);
493
494 if (atomic_read(&fs_info->nr_async_submits) < limit &&
495 waitqueue_active(&fs_info->async_submit_wait))
496 wake_up(&fs_info->async_submit_wait);
497
498 async->submit_bio_done(async->inode, async->rw, async->bio,
499 async->mirror_num, async->bio_flags);
500}
501
502static void run_one_async_free(struct btrfs_work *work)
503{
504 struct async_submit_bio *async;
505
506 async = container_of(work, struct async_submit_bio, work);
507 kfree(async);
508}
509
/*
 * Queue @bio for submission from the fs_info->workers pool.
 *
 * An async_submit_bio is allocated to carry the arguments and the two
 * hooks: @submit_bio_start is run from work.func and @submit_bio_done
 * from work.ordered_func; work.ordered_free releases the struct.
 *
 * Returns 0 once the work is queued, or -ENOMEM if the allocation fails.
 */
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
			int rw, struct bio *bio, int mirror_num,
			unsigned long bio_flags,
			extent_submit_bio_hook_t *submit_bio_start,
			extent_submit_bio_hook_t *submit_bio_done)
{
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return -ENOMEM;

	async->inode = inode;
	async->rw = rw;
	async->bio = bio;
	async->mirror_num = mirror_num;
	async->submit_bio_start = submit_bio_start;
	async->submit_bio_done = submit_bio_done;

	async->work.func = run_one_async_start;
	async->work.ordered_func = run_one_async_done;
	async->work.ordered_free = run_one_async_free;

	async->work.flags = 0;
	async->bio_flags = bio_flags;

	/* counted here, dropped again in run_one_async_done() */
	atomic_inc(&fs_info->nr_async_submits);
	btrfs_queue_worker(&fs_info->workers, &async->work);
	/* throttling against the submit limit is compiled out */
#if 0
	int limit = btrfs_async_submit_limit(fs_info);
	if (atomic_read(&fs_info->nr_async_submits) > limit) {
		wait_event_timeout(fs_info->async_submit_wait,
			   (atomic_read(&fs_info->nr_async_submits) < limit),
			   HZ/10);

		wait_event_timeout(fs_info->async_submit_wait,
			   (atomic_read(&fs_info->nr_async_bios) < limit),
			   HZ/10);
	}
#endif
	/*
	 * while async_submit_draining is set, block until every queued
	 * submission (including this one) has been processed
	 */
	while (atomic_read(&fs_info->async_submit_draining) &&
	      atomic_read(&fs_info->nr_async_submits)) {
		wait_event(fs_info->async_submit_wait,
			   (atomic_read(&fs_info->nr_async_submits) == 0));
	}

	return 0;
}
558
559static int btree_csum_one_bio(struct bio *bio)
560{
561 struct bio_vec *bvec = bio->bi_io_vec;
562 int bio_index = 0;
563 struct btrfs_root *root;
564
565 WARN_ON(bio->bi_vcnt <= 0);
566 while (bio_index < bio->bi_vcnt) {
567 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
568 csum_dirty_buffer(root, bvec->bv_page);
569 bio_index++;
570 bvec++;
571 }
572 return 0;
573}
574
/*
 * "start" half of an async btree write (see btree_submit_bio_hook()):
 * checksums the tree blocks carried by @bio.  The remaining arguments
 * are unused here.  Always returns 0.
 */
static int __btree_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Checksum the blocks here; the bio itself
	 * is mapped later by __btree_submit_bio_done
	 */
	btree_csum_one_bio(bio);
	return 0;
}
586
587static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
588 int mirror_num, unsigned long bio_flags)
589{
590 /*
591 * when we're called for a write, we're already in the async
592 * submission context. Just jump into btrfs_map_bio
593 */
594 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
595}
596
597static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
598 int mirror_num, unsigned long bio_flags)
599{
600 int ret;
601
602 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
603 bio, 1);
604 BUG_ON(ret);
605
606 if (!(rw & (1 << BIO_RW))) {
607 /*
608 * called for a read, do the setup so that checksum validation
609 * can happen in the async kernel threads
610 */
611 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
612 mirror_num, 0);
613 }
614 /*
615 * kthread helpers are used to submit writes so that checksumming
616 * can happen in parallel across all CPUs
617 */
618 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
619 inode, rw, bio, mirror_num, 0,
620 __btree_submit_bio_start,
621 __btree_submit_bio_done);
622}
623
624static int btree_writepage(struct page *page, struct writeback_control *wbc)
625{
626 struct extent_io_tree *tree;
627 tree = &BTRFS_I(page->mapping->host)->io_tree;
628
629 if (current->flags & PF_MEMALLOC) {
630 redirty_page_for_writepage(wbc, page);
631 unlock_page(page);
632 return 0;
633 }
634 return extent_write_full_page(tree, page, btree_get_extent, wbc);
635}
636
637static int btree_writepages(struct address_space *mapping,
638 struct writeback_control *wbc)
639{
640 struct extent_io_tree *tree;
641 tree = &BTRFS_I(mapping->host)->io_tree;
642 if (wbc->sync_mode == WB_SYNC_NONE) {
643 u64 num_dirty;
644 u64 start = 0;
645 unsigned long thresh = 32 * 1024 * 1024;
646
647 if (wbc->for_kupdate)
648 return 0;
649
650 num_dirty = count_range_bits(tree, &start, (u64)-1,
651 thresh, EXTENT_DIRTY);
652 if (num_dirty < thresh)
653 return 0;
654 }
655 return extent_writepages(tree, mapping, btree_get_extent, wbc);
656}
657
658static int btree_readpage(struct file *file, struct page *page)
659{
660 struct extent_io_tree *tree;
661 tree = &BTRFS_I(page->mapping->host)->io_tree;
662 return extent_read_full_page(tree, page, btree_get_extent);
663}
664
665static int btree_releasepage(struct page *page, gfp_t gfp_flags)
666{
667 struct extent_io_tree *tree;
668 struct extent_map_tree *map;
669 int ret;
670
671 if (PageWriteback(page) || PageDirty(page))
672 return 0;
673
674 tree = &BTRFS_I(page->mapping->host)->io_tree;
675 map = &BTRFS_I(page->mapping->host)->extent_tree;
676
677 ret = try_release_extent_state(map, tree, page, gfp_flags);
678 if (!ret)
679 return 0;
680
681 ret = try_release_extent_buffer(tree, page);
682 if (ret == 1) {
683 ClearPagePrivate(page);
684 set_page_private(page, 0);
685 page_cache_release(page);
686 }
687
688 return ret;
689}
690
691static void btree_invalidatepage(struct page *page, unsigned long offset)
692{
693 struct extent_io_tree *tree;
694 tree = &BTRFS_I(page->mapping->host)->io_tree;
695 extent_invalidatepage(tree, page, offset);
696 btree_releasepage(page, GFP_NOFS);
697 if (PagePrivate(page)) {
698 printk(KERN_WARNING "btrfs warning page private not zero "
699 "on page %llu\n", (unsigned long long)page_offset(page));
700 ClearPagePrivate(page);
701 set_page_private(page, 0);
702 page_cache_release(page);
703 }
704}
705
/*
 * Compiled out: a buffer_head based btree_writepage().  The version that
 * is actually built is the extent_io based btree_writepage() defined
 * earlier in this file.
 */
#if 0
static int btree_writepage(struct page *page, struct writeback_control *wbc)
{
	struct buffer_head *bh;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct buffer_head *head;
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, root->fs_info->sb->s_blocksize,
					(1 << BH_Dirty)|(1 << BH_Uptodate));
	}
	head = page_buffers(page);
	bh = head;
	/* checksum every dirty buffer on the page before writing it */
	do {
		if (buffer_dirty(bh))
			csum_tree_block(root, bh, 0);
		bh = bh->b_this_page;
	} while (bh != head);
	return block_write_full_page(page, btree_get_block, wbc);
}
#endif
726
/*
 * address_space operations table wiring up the btree_* page callbacks
 * defined above; sync_page uses the generic block_sync_page.
 */
static struct address_space_operations btree_aops = {
	.readpage	= btree_readpage,
	.writepage	= btree_writepage,
	.writepages	= btree_writepages,
	.releasepage	= btree_releasepage,
	.invalidatepage = btree_invalidatepage,
	.sync_page	= block_sync_page,
};
735
736int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
737 u64 parent_transid)
738{
739 struct extent_buffer *buf = NULL;
740 struct inode *btree_inode = root->fs_info->btree_inode;
741 int ret = 0;
742
743 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
744 if (!buf)
745 return 0;
746 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
747 buf, 0, 0, btree_get_extent, 0);
748 free_extent_buffer(buf);
749 return ret;
750}
751
752struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
753 u64 bytenr, u32 blocksize)
754{
755 struct inode *btree_inode = root->fs_info->btree_inode;
756 struct extent_buffer *eb;
757 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
758 bytenr, blocksize, GFP_NOFS);
759 return eb;
760}
761
762struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
763 u64 bytenr, u32 blocksize)
764{
765 struct inode *btree_inode = root->fs_info->btree_inode;
766 struct extent_buffer *eb;
767
768 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
769 bytenr, blocksize, NULL, GFP_NOFS);
770 return eb;
771}
772
773
774int btrfs_write_tree_block(struct extent_buffer *buf)
775{
776 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
777 buf->start + buf->len - 1, WB_SYNC_ALL);
778}
779
780int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
781{
782 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
783 buf->start, buf->start + buf->len - 1);
784}
785
786struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
787 u32 blocksize, u64 parent_transid)
788{
789 struct extent_buffer *buf = NULL;
790 struct inode *btree_inode = root->fs_info->btree_inode;
791 struct extent_io_tree *io_tree;
792 int ret;
793
794 io_tree = &BTRFS_I(btree_inode)->io_tree;
795
796 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
797 if (!buf)
798 return NULL;
799
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801
802 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE;
804 else
805 WARN_ON(1);
806 return buf;
807
808}
809
810int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
811 struct extent_buffer *buf)
812{
813 struct inode *btree_inode = root->fs_info->btree_inode;
814 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf));
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf);
819 }
820 return 0;
821}
822
/*
 * Initialise the in-memory fields of @root: block sizes, owner fs_info,
 * objectid, lists, locks, the dirty_log_pages io tree, the leaf ref tree
 * and the embedded anon_super.  No tree block is read here; root->node
 * is left NULL for the caller to fill in.
 *
 * Always returns 0.
 */
static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
			u32 stripesize, struct btrfs_root *root,
			struct btrfs_fs_info *fs_info,
			u64 objectid)
{
	root->node = NULL;
	root->commit_root = NULL;
	root->ref_tree = NULL;
	root->sectorsize = sectorsize;
	root->nodesize = nodesize;
	root->leafsize = leafsize;
	root->stripesize = stripesize;
	root->ref_cows = 0;
	root->track_dirty = 0;

	root->fs_info = fs_info;
	root->objectid = objectid;
	root->last_trans = 0;
	root->highest_inode = 0;
	root->last_inode_alloc = 0;
	root->name = NULL;
	root->in_sysfs = 0;

	INIT_LIST_HEAD(&root->dirty_list);
	INIT_LIST_HEAD(&root->orphan_list);
	INIT_LIST_HEAD(&root->dead_list);
	spin_lock_init(&root->node_lock);
	spin_lock_init(&root->list_lock);
	mutex_init(&root->objectid_mutex);
	mutex_init(&root->log_mutex);
	/* dirty log pages are tracked against the btree inode's mapping */
	extent_io_tree_init(&root->dirty_log_pages,
			     fs_info->btree_inode->i_mapping, GFP_NOFS);

	/* ref_tree was cleared above; point it at the embedded tree now */
	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
	root->ref_tree = &root->ref_tree_struct;

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
	root->defrag_trans_start = fs_info->generation;
	init_completion(&root->kobj_unregister);
	root->defrag_running = 0;
	root->defrag_level = 0;
	/* root_key was zeroed above; only the objectid is filled in here */
	root->root_key.objectid = objectid;
	root->anon_super.s_root = NULL;
	root->anon_super.s_dev = 0;
	INIT_LIST_HEAD(&root->anon_super.s_list);
	INIT_LIST_HEAD(&root->anon_super.s_instances);
	init_rwsem(&root->anon_super.s_umount);

	return 0;
}
876
877static int find_and_setup_root(struct btrfs_root *tree_root,
878 struct btrfs_fs_info *fs_info,
879 u64 objectid,
880 struct btrfs_root *root)
881{
882 int ret;
883 u32 blocksize;
884 u64 generation;
885
886 __setup_root(tree_root->nodesize, tree_root->leafsize,
887 tree_root->sectorsize, tree_root->stripesize,
888 root, fs_info, objectid);
889 ret = btrfs_find_last_root(tree_root, objectid,
890 &root->root_item, &root->root_key);
891 BUG_ON(ret);
892
893 generation = btrfs_root_generation(&root->root_item);
894 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
895 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
896 blocksize, generation);
897 BUG_ON(!root->node);
898 return 0;
899}
900
901int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
902 struct btrfs_fs_info *fs_info)
903{
904 struct extent_buffer *eb;
905 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
906 u64 start = 0;
907 u64 end = 0;
908 int ret;
909
910 if (!log_root_tree)
911 return 0;
912
913 while (1) {
914 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
915 0, &start, &end, EXTENT_DIRTY);
916 if (ret)
917 break;
918
919 clear_extent_dirty(&log_root_tree->dirty_log_pages,
920 start, end, GFP_NOFS);
921 }
922 eb = fs_info->log_root_tree->node;
923
924 WARN_ON(btrfs_header_level(eb) != 0);
925 WARN_ON(btrfs_header_nritems(eb) != 0);
926
927 ret = btrfs_free_reserved_extent(fs_info->tree_root,
928 eb->start, eb->len);
929 BUG_ON(ret);
930
931 free_extent_buffer(eb);
932 kfree(fs_info->log_root_tree);
933 fs_info->log_root_tree = NULL;
934 return 0;
935}
936
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info)
939{
940 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root;
942
943 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root)
945 return -ENOMEM;
946
947 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize,
949 root, fs_info, BTRFS_TREE_LOG_OBJECTID);
950
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
954 root->ref_cows = 0;
955
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0);
959
960 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start);
963 btrfs_set_header_generation(root->node, trans->transid);
964 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
965
966 write_extent_buffer(root->node, root->fs_info->fsid,
967 (unsigned long)btrfs_header_fsid(root->node),
968 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root;
972 return 0;
973}
974
975struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
976 struct btrfs_key *location)
977{
978 struct btrfs_root *root;
979 struct btrfs_fs_info *fs_info = tree_root->fs_info;
980 struct btrfs_path *path;
981 struct extent_buffer *l;
982 u64 highest_inode;
983 u64 generation;
984 u32 blocksize;
985 int ret = 0;
986
987 root = kzalloc(sizeof(*root), GFP_NOFS);
988 if (!root)
989 return ERR_PTR(-ENOMEM);
990 if (location->offset == (u64)-1) {
991 ret = find_and_setup_root(tree_root, fs_info,
992 location->objectid, root);
993 if (ret) {
994 kfree(root);
995 return ERR_PTR(ret);
996 }
997 goto insert;
998 }
999
1000 __setup_root(tree_root->nodesize, tree_root->leafsize,
1001 tree_root->sectorsize, tree_root->stripesize,
1002 root, fs_info, location->objectid);
1003
1004 path = btrfs_alloc_path();
1005 BUG_ON(!path);
1006 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1007 if (ret != 0) {
1008 if (ret > 0)
1009 ret = -ENOENT;
1010 goto out;
1011 }
1012 l = path->nodes[0];
1013 read_extent_buffer(l, &root->root_item,
1014 btrfs_item_ptr_offset(l, path->slots[0]),
1015 sizeof(root->root_item));
1016 memcpy(&root->root_key, location, sizeof(*location));
1017 ret = 0;
1018out:
1019 btrfs_release_path(root, path);
1020 btrfs_free_path(path);
1021 if (ret) {
1022 kfree(root);
1023 return ERR_PTR(ret);
1024 }
1025 generation = btrfs_root_generation(&root->root_item);
1026 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1027 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1028 blocksize, generation);
1029 BUG_ON(!root->node);
1030insert:
1031 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1032 root->ref_cows = 1;
1033 ret = btrfs_find_highest_inode(root, &highest_inode);
1034 if (ret == 0) {
1035 root->highest_inode = highest_inode;
1036 root->last_inode_alloc = highest_inode;
1037 }
1038 }
1039 return root;
1040}
1041
1042struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1043 u64 root_objectid)
1044{
1045 struct btrfs_root *root;
1046
1047 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1048 return fs_info->tree_root;
1049 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1050 return fs_info->extent_root;
1051
1052 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1053 (unsigned long)root_objectid);
1054 return root;
1055}
1056
1057struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1058 struct btrfs_key *location)
1059{
1060 struct btrfs_root *root;
1061 int ret;
1062
1063 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1064 return fs_info->tree_root;
1065 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1066 return fs_info->extent_root;
1067 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1068 return fs_info->chunk_root;
1069 if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1070 return fs_info->dev_root;
1071 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1072 return fs_info->csum_root;
1073
1074 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1075 (unsigned long)location->objectid);
1076 if (root)
1077 return root;
1078
1079 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1080 if (IS_ERR(root))
1081 return root;
1082
1083 set_anon_super(&root->anon_super, NULL);
1084
1085 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1086 (unsigned long)root->root_key.objectid,
1087 root);
1088 if (ret) {
1089 free_extent_buffer(root->node);
1090 kfree(root);
1091 return ERR_PTR(ret);
1092 }
1093 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1094 ret = btrfs_find_dead_roots(fs_info->tree_root,
1095 root->root_key.objectid, root);
1096 BUG_ON(ret);
1097 btrfs_orphan_cleanup(root);
1098 }
1099 return root;
1100}
1101
1102struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1103 struct btrfs_key *location,
1104 const char *name, int namelen)
1105{
1106 struct btrfs_root *root;
1107 int ret;
1108
1109 root = btrfs_read_fs_root_no_name(fs_info, location);
1110 if (!root)
1111 return NULL;
1112
1113 if (root->in_sysfs)
1114 return root;
1115
1116 ret = btrfs_set_root_name(root, name, namelen);
1117 if (ret) {
1118 free_extent_buffer(root->node);
1119 kfree(root);
1120 return ERR_PTR(ret);
1121 }
1122#if 0
1123 ret = btrfs_sysfs_add_root(root);
1124 if (ret) {
1125 free_extent_buffer(root->node);
1126 kfree(root->name);
1127 kfree(root);
1128 return ERR_PTR(ret);
1129 }
1130#endif
1131 root->in_sysfs = 1;
1132 return root;
1133}
1134
1135static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device;
1141 struct backing_dev_info *bdi;
1142#if 0
1143 if ((bdi_bits & (1 << BDI_write_congested)) &&
1144 btrfs_congested_async(info, 0))
1145 return 1;
1146#endif
1147 list_for_each(cur, &info->fs_devices->devices) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev)
1150 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev);
1152 if (bdi && bdi_congested(bdi, bdi_bits)) {
1153 ret = 1;
1154 break;
1155 }
1156 }
1157 return ret;
1158}
1159
1160/*
1161 * this unplugs every device on the box, and it is only used when page
1162 * is null
1163 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{
1166 struct list_head *cur;
1167 struct btrfs_device *device;
1168 struct btrfs_fs_info *info;
1169
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev)
1174 continue;
1175
1176 bdi = blk_get_backing_dev_info(device->bdev);
1177 if (bdi->unplug_io_fn)
1178 bdi->unplug_io_fn(bdi, page);
1179 }
1180}
1181
/*
 * bdi unplug_io_fn callback (installed by setup_bdi()).
 *
 * NOTE(review): the first condition is hard-wired true ("1 || !page"),
 * so every call unplugs all devices through __unplug_io_fn() and the
 * per-page lookup below it is currently unreachable.
 */
static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
	struct inode *inode;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct address_space *mapping;
	u64 offset;

	/* the generic O_DIRECT read code does this */
	if (1 || !page) {
		__unplug_io_fn(bdi, page);
		return;
	}

	/*
	 * page->mapping may change at any time.  Get a consistent copy
	 * and use that for everything below
	 */
	smp_mb();
	mapping = page->mapping;
	if (!mapping)
		return;

	inode = mapping->host;

	/*
	 * don't do the expensive searching for a small number of
	 * devices
	 */
	if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
		__unplug_io_fn(bdi, page);
		return;
	}

	offset = page_offset(page);

	/* find the extent mapping covering this page */
	em_tree = &BTRFS_I(inode)->extent_tree;
	spin_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
	spin_unlock(&em_tree->lock);
	if (!em) {
		__unplug_io_fn(bdi, page);
		return;
	}

	/* block_start at or past EXTENT_MAP_LAST_BYTE: fall back to all */
	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
		free_extent_map(em);
		__unplug_io_fn(bdi, page);
		return;
	}
	/* turn the file offset into an offset inside the extent */
	offset = offset - em->start;
	btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
			  em->block_start + offset, page);
	free_extent_map(em);
}
1237
1238static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1239{
1240 bdi_init(bdi);
1241 bdi->ra_pages = default_backing_dev_info.ra_pages;
1242 bdi->state = 0;
1243 bdi->capabilities = default_backing_dev_info.capabilities;
1244 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1245 bdi->unplug_io_data = info;
1246 bdi->congested_fn = btrfs_congested_fn;
1247 bdi->congested_data = info;
1248 return 0;
1249}
1250
1251static int bio_ready_for_csum(struct bio *bio)
1252{
1253 u64 length = 0;
1254 u64 buf_len = 0;
1255 u64 start = 0;
1256 struct page *page;
1257 struct extent_io_tree *io_tree = NULL;
1258 struct btrfs_fs_info *info = NULL;
1259 struct bio_vec *bvec;
1260 int i;
1261 int ret;
1262
1263 bio_for_each_segment(bvec, bio, i) {
1264 page = bvec->bv_page;
1265 if (page->private == EXTENT_PAGE_PRIVATE) {
1266 length += bvec->bv_len;
1267 continue;
1268 }
1269 if (!page->private) {
1270 length += bvec->bv_len;
1271 continue;
1272 }
1273 length = bvec->bv_len;
1274 buf_len = page->private >> 2;
1275 start = page_offset(page) + bvec->bv_offset;
1276 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1277 info = BTRFS_I(page->mapping->host)->root->fs_info;
1278 }
1279 /* are we fully contained in this bio? */
1280 if (buf_len <= length)
1281 return 1;
1282
1283 ret = extent_range_uptodate(io_tree, start + length,
1284 start + buf_len - 1);
1285 if (ret == 1)
1286 return ret;
1287 return ret;
1288}
1289
1290/*
1291 * called by the kthread helper functions to finally call the bio end_io
1292 * functions. This is where read checksum verification actually happens
1293 */
1294static void end_workqueue_fn(struct btrfs_work *work)
1295{
1296 struct bio *bio;
1297 struct end_io_wq *end_io_wq;
1298 struct btrfs_fs_info *fs_info;
1299 int error;
1300
1301 end_io_wq = container_of(work, struct end_io_wq, work);
1302 bio = end_io_wq->bio;
1303 fs_info = end_io_wq->info;
1304
1305 /* metadata bio reads are special because the whole tree block must
1306 * be checksummed at once. This makes sure the entire block is in
1307 * ram and up to date before trying to verify things. For
1308 * blocksize <= pagesize, it is basically a noop
1309 */
1310 if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
1311 !bio_ready_for_csum(bio)) {
1312 btrfs_queue_worker(&fs_info->endio_meta_workers,
1313 &end_io_wq->work);
1314 return;
1315 }
1316 error = end_io_wq->error;
1317 bio->bi_private = end_io_wq->private;
1318 bio->bi_end_io = end_io_wq->end_io;
1319 kfree(end_io_wq);
1320 bio_endio(bio, error);
1321}
1322
/*
 * Kernel thread body: repeatedly runs btrfs_clean_old_snapshots() under
 * cleaner_mutex, then sleeps until woken (transaction_kthread() calls
 * wake_up_process() on it).  The loop ends when fs_info->closing is set
 * or the kthread is asked to stop.
 */
static int cleaner_kthread(void *arg)
{
	struct btrfs_root *root = arg;

	do {
		/* barrier before reading ->closing, which is set elsewhere */
		smp_mb();
		if (root->fs_info->closing)
			break;

		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
		mutex_lock(&root->fs_info->cleaner_mutex);
		btrfs_clean_old_snapshots(root);
		mutex_unlock(&root->fs_info->cleaner_mutex);

		if (freezing(current)) {
			refrigerator();
		} else {
			/* check again so a close request isn't slept through */
			smp_mb();
			if (root->fs_info->closing)
				break;
			/* no timeout: sleeps until somebody wakes us */
			set_current_state(TASK_INTERRUPTIBLE);
			schedule();
			__set_current_state(TASK_RUNNING);
		}
	} while (!kthread_should_stop());
	return 0;
}
1350
/*
 * Kernel thread body: commits the running transaction once it is at
 * least 30 seconds old and wakes the cleaner thread on every pass.
 *
 * The sleep between passes is 30 seconds (HZ * 30), shortened to 5
 * seconds when a running transaction exists but is still too young.
 * The loop ends when fs_info->closing is set or the kthread is asked to
 * stop.
 */
static int transaction_kthread(void *arg)
{
	struct btrfs_root *root = arg;
	struct btrfs_trans_handle *trans;
	struct btrfs_transaction *cur;
	unsigned long now;
	unsigned long delay;
	int ret;

	do {
		smp_mb();
		if (root->fs_info->closing)
			break;

		delay = HZ * 30;
		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
		mutex_lock(&root->fs_info->transaction_kthread_mutex);

		/* purely informational: report a ref cache above 20MB */
		if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
			printk(KERN_INFO "btrfs: total reference cache "
			       "size %llu\n",
			       root->fs_info->total_ref_cache_size);
		}

		/* trans_mutex is only held while looking at the transaction */
		mutex_lock(&root->fs_info->trans_mutex);
		cur = root->fs_info->running_transaction;
		if (!cur) {
			mutex_unlock(&root->fs_info->trans_mutex);
			goto sleep;
		}

		now = get_seconds();
		/* too young (or start_time is in the future): look again soon */
		if (now < cur->start_time || now - cur->start_time < 30) {
			mutex_unlock(&root->fs_info->trans_mutex);
			delay = HZ * 5;
			goto sleep;
		}
		mutex_unlock(&root->fs_info->trans_mutex);
		trans = btrfs_start_transaction(root, 1);
		/* NOTE(review): the commit result in 'ret' is never checked */
		ret = btrfs_commit_transaction(trans, root);
sleep:
		wake_up_process(root->fs_info->cleaner_kthread);
		mutex_unlock(&root->fs_info->transaction_kthread_mutex);

		if (freezing(current)) {
			refrigerator();
		} else {
			if (root->fs_info->closing)
				break;
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(delay);
			__set_current_state(TASK_RUNNING);
		}
	} while (!kthread_should_stop());
	return 0;
}
1407
1408struct btrfs_root *open_ctree(struct super_block *sb,
1409 struct btrfs_fs_devices *fs_devices,
1410 char *options)
1411{
1412 u32 sectorsize;
1413 u32 nodesize;
1414 u32 leafsize;
1415 u32 blocksize;
1416 u32 stripesize;
1417 u64 generation;
1418 u64 features;
1419 struct btrfs_key location;
1420 struct buffer_head *bh;
1421 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1422 GFP_NOFS);
1423 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1424 GFP_NOFS);
1425 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1426 GFP_NOFS);
1427 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1428 GFP_NOFS);
1429 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1430 GFP_NOFS);
1431 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1432 GFP_NOFS);
1433 struct btrfs_root *log_tree_root;
1434
1435 int ret;
1436 int err = -EINVAL;
1437
1438 struct btrfs_super_block *disk_super;
1439
1440 if (!extent_root || !tree_root || !fs_info ||
1441 !chunk_root || !dev_root || !csum_root) {
1442 err = -ENOMEM;
1443 goto fail;
1444 }
1445 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1446 INIT_LIST_HEAD(&fs_info->trans_list);
1447 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock);
1454
1455 init_completion(&fs_info->kobj_unregister);
1456 fs_info->tree_root = tree_root;
1457 fs_info->extent_root = extent_root;
1458 fs_info->csum_root = csum_root;
1459 fs_info->chunk_root = chunk_root;
1460 fs_info->dev_root = dev_root;
1461 fs_info->fs_devices = fs_devices;
1462 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1463 INIT_LIST_HEAD(&fs_info->space_info);
1464 btrfs_mapping_init(&fs_info->mapping_tree);
1465 atomic_set(&fs_info->nr_async_submits, 0);
1466 atomic_set(&fs_info->async_delalloc_pages, 0);
1467 atomic_set(&fs_info->async_submit_draining, 0);
1468 atomic_set(&fs_info->nr_async_bios, 0);
1469 atomic_set(&fs_info->throttles, 0);
1470 atomic_set(&fs_info->throttle_gen, 0);
1471 fs_info->sb = sb;
1472 fs_info->max_extent = (u64)-1;
1473 fs_info->max_inline = 8192 * 1024;
1474 setup_bdi(fs_info, &fs_info->bdi);
1475 fs_info->btree_inode = new_inode(sb);
1476 fs_info->btree_inode->i_ino = 1;
1477 fs_info->btree_inode->i_nlink = 1;
1478
1479 fs_info->thread_pool_size = min_t(unsigned long,
1480 num_online_cpus() + 2, 8);
1481
1482 INIT_LIST_HEAD(&fs_info->ordered_extents);
1483 spin_lock_init(&fs_info->ordered_extent_lock);
1484
1485 sb->s_blocksize = 4096;
1486 sb->s_blocksize_bits = blksize_bits(4096);
1487
1488 /*
1489 * we set the i_size on the btree inode to the max possible int.
1490 * the real end of the address space is determined by all of
1491 * the devices in the system
1492 */
1493 fs_info->btree_inode->i_size = OFFSET_MAX;
1494 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1495 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1496
1497 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1498 fs_info->btree_inode->i_mapping,
1499 GFP_NOFS);
1500 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1501 GFP_NOFS);
1502
1503 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1504
1505 spin_lock_init(&fs_info->block_group_cache_lock);
1506 fs_info->block_group_cache_tree.rb_node = NULL;
1507
1508 extent_io_tree_init(&fs_info->pinned_extents,
1509 fs_info->btree_inode->i_mapping, GFP_NOFS);
1510 extent_io_tree_init(&fs_info->pending_del,
1511 fs_info->btree_inode->i_mapping, GFP_NOFS);
1512 extent_io_tree_init(&fs_info->extent_ins,
1513 fs_info->btree_inode->i_mapping, GFP_NOFS);
1514 fs_info->do_barriers = 1;
1515
1516 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1517 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1518 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1519
1520 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1521 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1522 sizeof(struct btrfs_key));
1523 insert_inode_hash(fs_info->btree_inode);
1524
1525 mutex_init(&fs_info->trans_mutex);
1526 mutex_init(&fs_info->tree_log_mutex);
1527 mutex_init(&fs_info->drop_mutex);
1528 mutex_init(&fs_info->extent_ins_mutex);
1529 mutex_init(&fs_info->pinned_mutex);
1530 mutex_init(&fs_info->chunk_mutex);
1531 mutex_init(&fs_info->transaction_kthread_mutex);
1532 mutex_init(&fs_info->cleaner_mutex);
1533 mutex_init(&fs_info->volume_mutex);
1534 mutex_init(&fs_info->tree_reloc_mutex);
1535 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542
1543 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1545
1546
1547 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1548 if (!bh)
1549 goto fail_iput;
1550
1551 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1552 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
1553 sizeof(fs_info->super_for_commit));
1554 brelse(bh);
1555
1556 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1557
1558 disk_super = &fs_info->super_copy;
1559 if (!btrfs_super_root(disk_super))
1560 goto fail_iput;
1561
1562 ret = btrfs_parse_options(tree_root, options);
1563 if (ret) {
1564 err = ret;
1565 goto fail_iput;
1566 }
1567
1568 features = btrfs_super_incompat_flags(disk_super) &
1569 ~BTRFS_FEATURE_INCOMPAT_SUPP;
1570 if (features) {
1571 printk(KERN_ERR "BTRFS: couldn't mount because of "
1572 "unsupported optional features (%Lx).\n",
1573 features);
1574 err = -EINVAL;
1575 goto fail_iput;
1576 }
1577
1578 features = btrfs_super_compat_ro_flags(disk_super) &
1579 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
1580 if (!(sb->s_flags & MS_RDONLY) && features) {
1581 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1582 "unsupported option features (%Lx).\n",
1583 features);
1584 err = -EINVAL;
1585 goto fail_iput;
1586 }
1587
1588 /*
1589 * we need to start all the end_io workers up front because the
1590 * queue work function gets called at interrupt time, and so it
1591 * cannot dynamically grow.
1592 */
1593 btrfs_init_workers(&fs_info->workers, "worker",
1594 fs_info->thread_pool_size);
1595
1596 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1597 fs_info->thread_pool_size);
1598
1599 btrfs_init_workers(&fs_info->submit_workers, "submit",
1600 min_t(u64, fs_devices->num_devices,
1601 fs_info->thread_pool_size));
1602
1603 /* a higher idle thresh on the submit workers makes it much more
1604 * likely that bios will be send down in a sane order to the
1605 * devices
1606 */
1607 fs_info->submit_workers.idle_thresh = 64;
1608
1609 fs_info->workers.idle_thresh = 16;
1610 fs_info->workers.ordered = 1;
1611
1612 fs_info->delalloc_workers.idle_thresh = 2;
1613 fs_info->delalloc_workers.ordered = 1;
1614
1615 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1616 btrfs_init_workers(&fs_info->endio_workers, "endio",
1617 fs_info->thread_pool_size);
1618 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1619 fs_info->thread_pool_size);
1620 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1621 "endio-meta-write", fs_info->thread_pool_size);
1622 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1623 fs_info->thread_pool_size);
1624
1625 /*
1626 * endios are largely parallel and should have a very
1627 * low idle thresh
1628 */
1629 fs_info->endio_workers.idle_thresh = 4;
1630 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632
1633 btrfs_start_workers(&fs_info->workers, 1);
1634 btrfs_start_workers(&fs_info->submit_workers, 1);
1635 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1636 btrfs_start_workers(&fs_info->fixup_workers, 1);
1637 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1638 btrfs_start_workers(&fs_info->endio_meta_workers,
1639 fs_info->thread_pool_size);
1640 btrfs_start_workers(&fs_info->endio_meta_write_workers,
1641 fs_info->thread_pool_size);
1642 btrfs_start_workers(&fs_info->endio_write_workers,
1643 fs_info->thread_pool_size);
1644
1645 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1646 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1647 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
1648
1649 nodesize = btrfs_super_nodesize(disk_super);
1650 leafsize = btrfs_super_leafsize(disk_super);
1651 sectorsize = btrfs_super_sectorsize(disk_super);
1652 stripesize = btrfs_super_stripesize(disk_super);
1653 tree_root->nodesize = nodesize;
1654 tree_root->leafsize = leafsize;
1655 tree_root->sectorsize = sectorsize;
1656 tree_root->stripesize = stripesize;
1657
1658 sb->s_blocksize = sectorsize;
1659 sb->s_blocksize_bits = blksize_bits(sectorsize);
1660
1661 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1662 sizeof(disk_super->magic))) {
1663 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
1664 goto fail_sb_buffer;
1665 }
1666
1667 mutex_lock(&fs_info->chunk_mutex);
1668 ret = btrfs_read_sys_array(tree_root);
1669 mutex_unlock(&fs_info->chunk_mutex);
1670 if (ret) {
1671 printk(KERN_WARNING "btrfs: failed to read the system "
1672 "array on %s\n", sb->s_id);
1673 goto fail_sys_array;
1674 }
1675
1676 blocksize = btrfs_level_size(tree_root,
1677 btrfs_super_chunk_root_level(disk_super));
1678 generation = btrfs_super_chunk_root_generation(disk_super);
1679
1680 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1681 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1682
1683 chunk_root->node = read_tree_block(chunk_root,
1684 btrfs_super_chunk_root(disk_super),
1685 blocksize, generation);
1686 BUG_ON(!chunk_root->node);
1687
1688 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1689 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1690 BTRFS_UUID_SIZE);
1691
1692 mutex_lock(&fs_info->chunk_mutex);
1693 ret = btrfs_read_chunk_tree(chunk_root);
1694 mutex_unlock(&fs_info->chunk_mutex);
1695 if (ret) {
1696 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1697 sb->s_id);
1698 goto fail_chunk_root;
1699 }
1700
1701 btrfs_close_extra_devices(fs_devices);
1702
1703 blocksize = btrfs_level_size(tree_root,
1704 btrfs_super_root_level(disk_super));
1705 generation = btrfs_super_generation(disk_super);
1706
1707 tree_root->node = read_tree_block(tree_root,
1708 btrfs_super_root(disk_super),
1709 blocksize, generation);
1710 if (!tree_root->node)
1711 goto fail_chunk_root;
1712
1713
1714 ret = find_and_setup_root(tree_root, fs_info,
1715 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1716 if (ret)
1717 goto fail_tree_root;
1718 extent_root->track_dirty = 1;
1719
1720 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1;
1723
1724 if (ret)
1725 goto fail_extent_root;
1726
1727 ret = find_and_setup_root(tree_root, fs_info,
1728 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1729 if (ret)
1730 goto fail_extent_root;
1731
1732 csum_root->track_dirty = 1;
1733
1734 btrfs_read_block_groups(extent_root);
1735
1736 fs_info->generation = generation;
1737 fs_info->last_trans_committed = generation;
1738 fs_info->data_alloc_profile = (u64)-1;
1739 fs_info->metadata_alloc_profile = (u64)-1;
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread)
1744 goto fail_csum_root;
1745
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root,
1748 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread)
1750 goto fail_cleaner;
1751
1752 if (btrfs_super_log_root(disk_super) != 0) {
1753 u64 bytenr = btrfs_super_log_root(disk_super);
1754
1755 if (fs_devices->rw_devices == 0) {
1756 printk(KERN_WARNING "Btrfs log replay required "
1757 "on RO media\n");
1758 err = -EIO;
1759 goto fail_trans_kthread;
1760 }
1761 blocksize =
1762 btrfs_level_size(tree_root,
1763 btrfs_super_log_root_level(disk_super));
1764
1765 log_tree_root = kzalloc(sizeof(struct btrfs_root),
1766 GFP_NOFS);
1767
1768 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1769 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1770
1771 log_tree_root->node = read_tree_block(tree_root, bytenr,
1772 blocksize,
1773 generation + 1);
1774 ret = btrfs_recover_log_trees(log_tree_root);
1775 BUG_ON(ret);
1776
1777 if (sb->s_flags & MS_RDONLY) {
1778 ret = btrfs_commit_super(tree_root);
1779 BUG_ON(ret);
1780 }
1781 }
1782
1783 if (!(sb->s_flags & MS_RDONLY)) {
1784 ret = btrfs_cleanup_reloc_trees(tree_root);
1785 BUG_ON(ret);
1786 }
1787
1788 location.objectid = BTRFS_FS_TREE_OBJECTID;
1789 location.type = BTRFS_ROOT_ITEM_KEY;
1790 location.offset = (u64)-1;
1791
1792 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1793 if (!fs_info->fs_root)
1794 goto fail_trans_kthread;
1795 return tree_root;
1796
1797fail_trans_kthread:
1798 kthread_stop(fs_info->transaction_kthread);
1799fail_cleaner:
1800 kthread_stop(fs_info->cleaner_kthread);
1801
1802 /*
1803 * make sure we're done with the btree inode before we stop our
1804 * kthreads
1805 */
1806 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1807 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1808
1809fail_csum_root:
1810 free_extent_buffer(csum_root->node);
1811fail_extent_root:
1812 free_extent_buffer(extent_root->node);
1813fail_tree_root:
1814 free_extent_buffer(tree_root->node);
1815fail_chunk_root:
1816 free_extent_buffer(chunk_root->node);
1817fail_sys_array:
1818 free_extent_buffer(dev_root->node);
1819fail_sb_buffer:
1820 btrfs_stop_workers(&fs_info->fixup_workers);
1821 btrfs_stop_workers(&fs_info->delalloc_workers);
1822 btrfs_stop_workers(&fs_info->workers);
1823 btrfs_stop_workers(&fs_info->endio_workers);
1824 btrfs_stop_workers(&fs_info->endio_meta_workers);
1825 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1826 btrfs_stop_workers(&fs_info->endio_write_workers);
1827 btrfs_stop_workers(&fs_info->submit_workers);
1828fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode);
1831fail:
1832 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1834
1835 kfree(extent_root);
1836 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info);
1839 kfree(chunk_root);
1840 kfree(dev_root);
1841 kfree(csum_root);
1842 return ERR_PTR(err);
1843}
1844
1845static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
1846{
1847 char b[BDEVNAME_SIZE];
1848
1849 if (uptodate) {
1850 set_buffer_uptodate(bh);
1851 } else {
1852 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
1853 printk(KERN_WARNING "lost page write due to "
1854 "I/O error on %s\n",
1855 bdevname(bh->b_bdev, b));
1856 }
1857 /* note, we dont' set_buffer_write_io_error because we have
1858 * our own ways of dealing with the IO errors
1859 */
1860 clear_buffer_uptodate(bh);
1861 }
1862 unlock_buffer(bh);
1863 put_bh(bh);
1864}
1865
1866struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
1867{
1868 struct buffer_head *bh;
1869 struct buffer_head *latest = NULL;
1870 struct btrfs_super_block *super;
1871 int i;
1872 u64 transid = 0;
1873 u64 bytenr;
1874
1875 /* we would like to check all the supers, but that would make
1876 * a btrfs mount succeed after a mkfs from a different FS.
1877 * So, we need to add a special mount option to scan for
1878 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1879 */
1880 for (i = 0; i < 1; i++) {
1881 bytenr = btrfs_sb_offset(i);
1882 if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
1883 break;
1884 bh = __bread(bdev, bytenr / 4096, 4096);
1885 if (!bh)
1886 continue;
1887
1888 super = (struct btrfs_super_block *)bh->b_data;
1889 if (btrfs_super_bytenr(super) != bytenr ||
1890 strncmp((char *)(&super->magic), BTRFS_MAGIC,
1891 sizeof(super->magic))) {
1892 brelse(bh);
1893 continue;
1894 }
1895
1896 if (!latest || btrfs_super_generation(super) > transid) {
1897 brelse(latest);
1898 latest = bh;
1899 transid = btrfs_super_generation(super);
1900 } else {
1901 brelse(bh);
1902 }
1903 }
1904 return latest;
1905}
1906
1907static int write_dev_supers(struct btrfs_device *device,
1908 struct btrfs_super_block *sb,
1909 int do_barriers, int wait, int max_mirrors)
1910{
1911 struct buffer_head *bh;
1912 int i;
1913 int ret;
1914 int errors = 0;
1915 u32 crc;
1916 u64 bytenr;
1917 int last_barrier = 0;
1918
1919 if (max_mirrors == 0)
1920 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
1921
1922 /* make sure only the last submit_bh does a barrier */
1923 if (do_barriers) {
1924 for (i = 0; i < max_mirrors; i++) {
1925 bytenr = btrfs_sb_offset(i);
1926 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1927 device->total_bytes)
1928 break;
1929 last_barrier = i;
1930 }
1931 }
1932
1933 for (i = 0; i < max_mirrors; i++) {
1934 bytenr = btrfs_sb_offset(i);
1935 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1936 break;
1937
1938 if (wait) {
1939 bh = __find_get_block(device->bdev, bytenr / 4096,
1940 BTRFS_SUPER_INFO_SIZE);
1941 BUG_ON(!bh);
1942 brelse(bh);
1943 wait_on_buffer(bh);
1944 if (buffer_uptodate(bh)) {
1945 brelse(bh);
1946 continue;
1947 }
1948 } else {
1949 btrfs_set_super_bytenr(sb, bytenr);
1950
1951 crc = ~(u32)0;
1952 crc = btrfs_csum_data(NULL, (char *)sb +
1953 BTRFS_CSUM_SIZE, crc,
1954 BTRFS_SUPER_INFO_SIZE -
1955 BTRFS_CSUM_SIZE);
1956 btrfs_csum_final(crc, sb->csum);
1957
1958 bh = __getblk(device->bdev, bytenr / 4096,
1959 BTRFS_SUPER_INFO_SIZE);
1960 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
1961
1962 set_buffer_uptodate(bh);
1963 get_bh(bh);
1964 lock_buffer(bh);
1965 bh->b_end_io = btrfs_end_buffer_write_sync;
1966 }
1967
1968 if (i == last_barrier && do_barriers && device->barriers) {
1969 ret = submit_bh(WRITE_BARRIER, bh);
1970 if (ret == -EOPNOTSUPP) {
1971 printk("btrfs: disabling barriers on dev %s\n",
1972 device->name);
1973 set_buffer_uptodate(bh);
1974 device->barriers = 0;
1975 get_bh(bh);
1976 lock_buffer(bh);
1977 ret = submit_bh(WRITE, bh);
1978 }
1979 } else {
1980 ret = submit_bh(WRITE, bh);
1981 }
1982
1983 if (!ret && wait) {
1984 wait_on_buffer(bh);
1985 if (!buffer_uptodate(bh))
1986 errors++;
1987 } else if (ret) {
1988 errors++;
1989 }
1990 if (wait)
1991 brelse(bh);
1992 }
1993 return errors < i ? 0 : -1;
1994}
1995
1996int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb;
2002 struct btrfs_dev_item *dev_item;
2003 int ret;
2004 int do_barriers;
2005 int max_errors;
2006 int total_errors = 0;
2007 u64 flags;
2008
2009 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
2010 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2011
2012 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) {
2017 total_errors++;
2018 continue;
2019 }
2020 if (!dev->in_fs_metadata || !dev->writeable)
2021 continue;
2022
2023 btrfs_set_stack_device_generation(dev_item, 0);
2024 btrfs_set_stack_device_type(dev_item, dev->type);
2025 btrfs_set_stack_device_id(dev_item, dev->devid);
2026 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
2027 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
2028 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
2029 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
2030 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
2031 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
2032 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
2033
2034 flags = btrfs_super_flags(sb);
2035 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
2036
2037 ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
2038 if (ret)
2039 total_errors++;
2040 }
2041 if (total_errors > max_errors) {
2042 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2043 total_errors);
2044 BUG();
2045 }
2046
2047 total_errors = 0;
2048 list_for_each(cur, head) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev)
2051 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable)
2053 continue;
2054
2055 ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
2056 if (ret)
2057 total_errors++;
2058 }
2059 if (total_errors > max_errors) {
2060 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2061 total_errors);
2062 BUG();
2063 }
2064 return 0;
2065}
2066
/*
 * Write the super block copies for @root's filesystem.  @trans is not
 * used; the work is done by write_all_supers(), whose result is returned.
 */
int write_ctree_super(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, int max_mirrors)
{
	return write_all_supers(root, max_mirrors);
}
2075
2076int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2077{
2078 radix_tree_delete(&fs_info->fs_roots_radix,
2079 (unsigned long)root->root_key.objectid);
2080 if (root->anon_super.s_dev) {
2081 down_write(&root->anon_super.s_umount);
2082 kill_anon_super(&root->anon_super);
2083 }
2084 if (root->node)
2085 free_extent_buffer(root->node);
2086 if (root->commit_root)
2087 free_extent_buffer(root->commit_root);
2088 kfree(root->name);
2089 kfree(root);
2090 return 0;
2091}
2092
2093static int del_fs_roots(struct btrfs_fs_info *fs_info)
2094{
2095 int ret;
2096 struct btrfs_root *gang[8];
2097 int i;
2098
2099 while (1) {
2100 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2101 (void **)gang, 0,
2102 ARRAY_SIZE(gang));
2103 if (!ret)
2104 break;
2105 for (i = 0; i < ret; i++)
2106 btrfs_free_fs_root(fs_info, gang[i]);
2107 }
2108 return 0;
2109}
2110
2111int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2112{
2113 u64 root_objectid = 0;
2114 struct btrfs_root *gang[8];
2115 int i;
2116 int ret;
2117
2118 while (1) {
2119 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2120 (void **)gang, root_objectid,
2121 ARRAY_SIZE(gang));
2122 if (!ret)
2123 break;
2124 for (i = 0; i < ret; i++) {
2125 root_objectid = gang[i]->root_key.objectid;
2126 ret = btrfs_find_dead_roots(fs_info->tree_root,
2127 root_objectid, gang[i]);
2128 BUG_ON(ret);
2129 btrfs_orphan_cleanup(gang[i]);
2130 }
2131 root_objectid++;
2132 }
2133 return 0;
2134}
2135
2136int btrfs_commit_super(struct btrfs_root *root)
2137{
2138 struct btrfs_trans_handle *trans;
2139 int ret;
2140
2141 mutex_lock(&root->fs_info->cleaner_mutex);
2142 btrfs_clean_old_snapshots(root);
2143 mutex_unlock(&root->fs_info->cleaner_mutex);
2144 trans = btrfs_start_transaction(root, 1);
2145 ret = btrfs_commit_transaction(trans, root);
2146 BUG_ON(ret);
2147 /* run commit again to drop the original snapshot */
2148 trans = btrfs_start_transaction(root, 1);
2149 btrfs_commit_transaction(trans, root);
2150 ret = btrfs_write_and_wait_transaction(NULL, root);
2151 BUG_ON(ret);
2152
2153 ret = write_ctree_super(NULL, root, 0);
2154 return ret;
2155}
2156
2157int close_ctree(struct btrfs_root *root)
2158{
2159 struct btrfs_fs_info *fs_info = root->fs_info;
2160 int ret;
2161
2162 fs_info->closing = 1;
2163 smp_mb();
2164
2165 kthread_stop(root->fs_info->transaction_kthread);
2166 kthread_stop(root->fs_info->cleaner_kthread);
2167
2168 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2169 ret = btrfs_commit_super(root);
2170 if (ret)
2171 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2172 }
2173
2174 if (fs_info->delalloc_bytes) {
2175 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2176 fs_info->delalloc_bytes);
2177 }
2178 if (fs_info->total_ref_cache_size) {
2179 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
2180 (unsigned long long)fs_info->total_ref_cache_size);
2181 }
2182
2183 if (fs_info->extent_root->node)
2184 free_extent_buffer(fs_info->extent_root->node);
2185
2186 if (fs_info->tree_root->node)
2187 free_extent_buffer(fs_info->tree_root->node);
2188
2189 if (root->fs_info->chunk_root->node)
2190 free_extent_buffer(root->fs_info->chunk_root->node);
2191
2192 if (root->fs_info->dev_root->node)
2193 free_extent_buffer(root->fs_info->dev_root->node);
2194
2195 if (root->fs_info->csum_root->node)
2196 free_extent_buffer(root->fs_info->csum_root->node);
2197
2198 btrfs_free_block_groups(root->fs_info);
2199
2200 del_fs_roots(fs_info);
2201
2202 iput(fs_info->btree_inode);
2203
2204 btrfs_stop_workers(&fs_info->fixup_workers);
2205 btrfs_stop_workers(&fs_info->delalloc_workers);
2206 btrfs_stop_workers(&fs_info->workers);
2207 btrfs_stop_workers(&fs_info->endio_workers);
2208 btrfs_stop_workers(&fs_info->endio_meta_workers);
2209 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2210 btrfs_stop_workers(&fs_info->endio_write_workers);
2211 btrfs_stop_workers(&fs_info->submit_workers);
2212
2213#if 0
2214 while (!list_empty(&fs_info->hashers)) {
2215 struct btrfs_hasher *hasher;
2216 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2217 hashers);
2218 list_del(&hasher->hashers);
2219 crypto_free_hash(&fs_info->hash_tfm);
2220 kfree(hasher);
2221 }
2222#endif
2223 btrfs_close_devices(fs_info->fs_devices);
2224 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2225
2226 bdi_destroy(&fs_info->bdi);
2227
2228 kfree(fs_info->extent_root);
2229 kfree(fs_info->tree_root);
2230 kfree(fs_info->chunk_root);
2231 kfree(fs_info->dev_root);
2232 kfree(fs_info->csum_root);
2233 return 0;
2234}
2235
2236int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2237{
2238 int ret;
2239 struct inode *btree_inode = buf->first_page->mapping->host;
2240
2241 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
2242 if (!ret)
2243 return ret;
2244
2245 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
2246 parent_transid);
2247 return !ret;
2248}
2249
2250int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
2251{
2252 struct inode *btree_inode = buf->first_page->mapping->host;
2253 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
2254 buf);
2255}
2256
2257void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2258{
2259 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2260 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode;
2262
2263 WARN_ON(!btrfs_tree_locked(buf));
2264 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2266 "found %llu running %llu\n",
2267 (unsigned long long)buf->start,
2268 (unsigned long long)transid,
2269 (unsigned long long)root->fs_info->generation);
2270 WARN_ON(1);
2271 }
2272 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
2273}
2274
2275void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2276{
2277 /*
2278 * looks as though older kernels can get into trouble with
2279 * this code, they end up stuck in balance_dirty_pages forever
2280 */
2281 struct extent_io_tree *tree;
2282 u64 num_dirty;
2283 u64 start = 0;
2284 unsigned long thresh = 32 * 1024 * 1024;
2285 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2286
2287 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
2288 return;
2289
2290 num_dirty = count_range_bits(tree, &start, (u64)-1,
2291 thresh, EXTENT_DIRTY);
2292 if (num_dirty > thresh) {
2293 balance_dirty_pages_ratelimited_nr(
2294 root->fs_info->btree_inode->i_mapping, 1);
2295 }
2296 return;
2297}
2298
2299int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2300{
2301 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2302 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE;
2306 return ret;
2307}
2308
2309int btree_lock_page_hook(struct page *page)
2310{
2311 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb;
2315 unsigned long len;
2316 u64 bytenr = page_offset(page);
2317
2318 if (page->private == EXTENT_PAGE_PRIVATE)
2319 goto out;
2320
2321 len = page->private >> 2;
2322 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2323 if (!eb)
2324 goto out;
2325
2326 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb);
2332out:
2333 lock_page(page);
2334 return 0;
2335}
2336
2337static struct extent_io_ops btree_extent_io_ops = {
2338 .write_cache_pages_lock_hook = btree_lock_page_hook,
2339 .readpage_end_io_hook = btree_readpage_end_io_hook,
2340 .submit_bio_hook = btree_submit_bio_hook,
2341 /* note we're sharing with inode.c for the merge bio hook */
2342 .merge_bio_hook = btrfs_merge_bio_hook,
2343};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..c0ff404c31b7
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,102 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __DISKIO__
20#define __DISKIO__
21
22#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
23#define BTRFS_SUPER_INFO_SIZE 4096
24
25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12
27
28static inline u64 btrfs_sb_offset(int mirror)
29{
30 u64 start = 16 * 1024;
31 if (mirror)
32 return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
33 return BTRFS_SUPER_INFO_OFFSET;
34}
35
36struct btrfs_device;
37struct btrfs_fs_devices;
38
39struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root, struct extent_buffer *buf);
47struct btrfs_root *open_ctree(struct super_block *sb,
48 struct btrfs_fs_devices *fs_devices,
49 char *options);
50int close_ctree(struct btrfs_root *root);
51int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
58 u64 root_objectid);
59struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
60 struct btrfs_key *location,
61 const char *name, int namelen);
62struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
63 struct btrfs_key *location);
64struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
65 struct btrfs_key *location);
66int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
67int btrfs_insert_dev_radix(struct btrfs_root *root,
68 struct block_device *bdev,
69 u64 device_id,
70 u64 block_start,
71 u64 num_blocks);
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root,
78 struct extent_buffer *buf);
79int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
80u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
81void btrfs_csum_final(u32 crc, char *result);
82int btrfs_open_device(struct btrfs_device *dev);
83int btrfs_verify_block_csum(struct btrfs_root *root,
84 struct extent_buffer *buf);
85int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
86 int metadata);
87int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
88 int rw, struct bio *bio, int mirror_num,
89 unsigned long bio_flags,
90 extent_submit_bio_hook_t *submit_bio_start,
91 extent_submit_bio_hook_t *submit_bio_done);
92
93int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
94unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
95int btrfs_write_tree_block(struct extent_buffer *buf);
96int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
97int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info);
101int btree_lock_page_hook(struct page *page);
102#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..85315d2c90de
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,203 @@
1#include <linux/fs.h>
2#include <linux/types.h>
3#include "ctree.h"
4#include "disk-io.h"
5#include "btrfs_inode.h"
6#include "print-tree.h"
7#include "export.h"
8#include "compat.h"
9
10#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
11 parent_objectid) / 4)
12#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
13 parent_root_objectid) / 4)
14#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
15
16static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
17 int connectable)
18{
19 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
20 struct inode *inode = dentry->d_inode;
21 int len = *max_len;
22 int type;
23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
26 return 255;
27
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT;
30
31 fid->objectid = BTRFS_I(inode)->location.objectid;
32 fid->root_objectid = BTRFS_I(inode)->root->objectid;
33 fid->gen = inode->i_generation;
34
35 if (connectable && !S_ISDIR(inode->i_mode)) {
36 struct inode *parent;
37 u64 parent_root_id;
38
39 spin_lock(&dentry->d_lock);
40
41 parent = dentry->d_parent->d_inode;
42 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
43 fid->parent_gen = parent->i_generation;
44 parent_root_id = BTRFS_I(parent)->root->objectid;
45
46 spin_unlock(&dentry->d_lock);
47
48 if (parent_root_id != fid->root_objectid) {
49 fid->parent_root_objectid = parent_root_id;
50 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
51 type = FILEID_BTRFS_WITH_PARENT_ROOT;
52 } else {
53 len = BTRFS_FID_SIZE_CONNECTABLE;
54 type = FILEID_BTRFS_WITH_PARENT;
55 }
56 }
57
58 *max_len = len;
59 return type;
60}
61
62static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
63 u64 root_objectid, u32 generation)
64{
65 struct btrfs_root *root;
66 struct inode *inode;
67 struct btrfs_key key;
68
69 key.objectid = root_objectid;
70 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
71 key.offset = (u64)-1;
72
73 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
74 if (IS_ERR(root))
75 return ERR_CAST(root);
76
77 key.objectid = objectid;
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0;
80
81 inode = btrfs_iget(sb, &key, root, NULL);
82 if (IS_ERR(inode))
83 return (void *)inode;
84
85 if (generation != inode->i_generation) {
86 iput(inode);
87 return ERR_PTR(-ESTALE);
88 }
89
90 return d_obtain_alias(inode);
91}
92
93static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
94 int fh_len, int fh_type)
95{
96 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
97 u64 objectid, root_objectid;
98 u32 generation;
99
100 if (fh_type == FILEID_BTRFS_WITH_PARENT) {
101 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
102 return NULL;
103 root_objectid = fid->root_objectid;
104 } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
105 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
106 return NULL;
107 root_objectid = fid->parent_root_objectid;
108 } else
109 return NULL;
110
111 objectid = fid->parent_objectid;
112 generation = fid->parent_gen;
113
114 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
115}
116
117static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
118 int fh_len, int fh_type)
119{
120 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
121 u64 objectid, root_objectid;
122 u32 generation;
123
124 if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
125 fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
126 (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
127 fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
128 (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
129 fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
130 return NULL;
131
132 objectid = fid->objectid;
133 root_objectid = fid->root_objectid;
134 generation = fid->gen;
135
136 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
137}
138
139static struct dentry *btrfs_get_parent(struct dentry *child)
140{
141 struct inode *dir = child->d_inode;
142 struct btrfs_root *root = BTRFS_I(dir)->root;
143 struct btrfs_key key;
144 struct btrfs_path *path;
145 struct extent_buffer *leaf;
146 int slot;
147 u64 objectid;
148 int ret;
149
150 path = btrfs_alloc_path();
151
152 key.objectid = dir->i_ino;
153 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
154 key.offset = (u64)-1;
155
156 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
157 if (ret < 0) {
158 /* Error */
159 btrfs_free_path(path);
160 return ERR_PTR(ret);
161 }
162 leaf = path->nodes[0];
163 slot = path->slots[0];
164 if (ret) {
165 /* btrfs_search_slot() returns the slot where we'd want to
166 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
167 The _real_ backref, telling us what the parent inode
168 _actually_ is, will be in the slot _before_ the one
169 that btrfs_search_slot() returns. */
170 if (!slot) {
171 /* Unless there is _no_ key in the tree before... */
172 btrfs_free_path(path);
173 return ERR_PTR(-EIO);
174 }
175 slot--;
176 }
177
178 btrfs_item_key_to_cpu(leaf, &key, slot);
179 btrfs_free_path(path);
180
181 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
182 return ERR_PTR(-EINVAL);
183
184 objectid = key.offset;
185
186 /* If we are already at the root of a subvol, return the real root */
187 if (objectid == dir->i_ino)
188 return dget(dir->i_sb->s_root);
189
190 /* Build a new key for the inode item */
191 key.objectid = objectid;
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0;
194
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
196}
197
/*
 * Export operations table wiring the file-handle helpers defined in this
 * file (encode, decode object, decode parent, find parent) into the
 * export_operations interface.
 */
198const struct export_operations btrfs_export_ops = {
199 .encode_fh = btrfs_encode_fh,
200 .fh_to_dentry = btrfs_fh_to_dentry,
201 .fh_to_parent = btrfs_fh_to_parent,
202 .get_parent = btrfs_get_parent,
203};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
1#ifndef BTRFS_EXPORT_H
2#define BTRFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations btrfs_export_ops;
7
/*
 * btrfs file handle payload.  The structure is packed; handle lengths are
 * derived from field offsets (see the BTRFS_FID_SIZE_* macros in export.c),
 * so field order and sizes must not change.
 */
8struct btrfs_fid {
9 u64 objectid; /* inode's location.objectid */
10 u64 root_objectid; /* objectid of the root the inode belongs to */
11 u32 gen; /* inode's i_generation */
12
13 u64 parent_objectid; /* parent inode's location.objectid */
14 u32 parent_gen; /* parent inode's i_generation */
15
16 u64 parent_root_objectid; /* only set when the parent's root differs */
17} __attribute__ ((packed));
18
19#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..293da650873f
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5986 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include <linux/version.h>
23#include "compat.h"
24#include "hash.h"
25#include "crc32c.h"
26#include "ctree.h"
27#include "disk-io.h"
28#include "print-tree.h"
29#include "transaction.h"
30#include "volumes.h"
31#include "locking.h"
32#include "ref-cache.h"
33#include "compat.h"
34
35#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1
37#define PENDING_BACKREF_UPDATE 2
38
/*
 * One queued extent-tree operation.  Instances are linked on a list and
 * consumed by update_backrefs(), insert_extents() and free_extents().
 */
39struct pending_extent_op {
40 int type; /* presumably one of the PENDING_* values above -- confirm */
41 u64 bytenr; /* first byte of the extent; used as the key objectid */
42 u64 num_bytes; /* extent length; used as the EXTENT_ITEM key offset */
43 u64 parent; /* key offset to use for the (new) backref */
44 u64 orig_parent; /* key offset of the existing backref to look up */
45 u64 generation; /* generation written into the backref */
46 u64 orig_generation; /* generation the existing backref must carry */
47 int level; /* stored in / compared with the backref's objectid field */
48 struct list_head list; /* linkage on the pending list */
49 int del; /* insert_extents() counts inserted items here; freed at 2 */
50};
51
52static int finish_current_insert(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all);
54static int del_pending_extents(struct btrfs_trans_handle *trans,
55 struct btrfs_root *extent_root, int all);
56static int pin_down_bytes(struct btrfs_trans_handle *trans,
57 struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data);
59static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free);
63
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{
66 return (cache->flags & bits) == bits;
67}
68
69/*
70 * this adds the block group to the fs_info rb tree for the block group
71 * cache
72 */
73static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
74 struct btrfs_block_group_cache *block_group)
75{
76 struct rb_node **p;
77 struct rb_node *parent = NULL;
78 struct btrfs_block_group_cache *cache;
79
80 spin_lock(&info->block_group_cache_lock);
81 p = &info->block_group_cache_tree.rb_node;
82
83 while (*p) {
84 parent = *p;
85 cache = rb_entry(parent, struct btrfs_block_group_cache,
86 cache_node);
87 if (block_group->key.objectid < cache->key.objectid) {
88 p = &(*p)->rb_left;
89 } else if (block_group->key.objectid > cache->key.objectid) {
90 p = &(*p)->rb_right;
91 } else {
92 spin_unlock(&info->block_group_cache_lock);
93 return -EEXIST;
94 }
95 }
96
97 rb_link_node(&block_group->cache_node, parent, p);
98 rb_insert_color(&block_group->cache_node,
99 &info->block_group_cache_tree);
100 spin_unlock(&info->block_group_cache_lock);
101
102 return 0;
103}
104
105/*
106 * This will return the block group at or after bytenr if contains is 0, else
107 * it will return the block group that contains the bytenr
108 */
109static struct btrfs_block_group_cache *
110block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
111 int contains)
112{
113 struct btrfs_block_group_cache *cache, *ret = NULL;
114 struct rb_node *n;
115 u64 end, start;
116
117 spin_lock(&info->block_group_cache_lock);
118 n = info->block_group_cache_tree.rb_node;
119
120 while (n) {
121 cache = rb_entry(n, struct btrfs_block_group_cache,
122 cache_node);
123 end = cache->key.objectid + cache->key.offset - 1;
124 start = cache->key.objectid;
125
126 if (bytenr < start) {
127 if (!contains && (!ret || start < ret->key.objectid))
128 ret = cache;
129 n = n->rb_left;
130 } else if (bytenr > start) {
131 if (contains && bytenr <= end) {
132 ret = cache;
133 break;
134 }
135 n = n->rb_right;
136 } else {
137 ret = cache;
138 break;
139 }
140 }
141 if (ret)
142 atomic_inc(&ret->count);
143 spin_unlock(&info->block_group_cache_lock);
144
145 return ret;
146}
147
148/*
149 * this is only called by cache_block_group, since we could have freed extents
150 * we need to check the pinned_extents for any extents that can't be used yet
151 * since their free space will be released as soon as the transaction commits.
152 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end)
155{
156 u64 extent_start, extent_end, size;
157 int ret;
158
159 mutex_lock(&info->pinned_mutex);
160 while (start < end) {
161 ret = find_first_extent_bit(&info->pinned_extents, start,
162 &extent_start, &extent_end,
163 EXTENT_DIRTY);
164 if (ret)
165 break;
166
167 if (extent_start == start) {
168 start = extent_end + 1;
169 } else if (extent_start > start && extent_start < end) {
170 size = extent_start - start;
171 ret = btrfs_add_free_space(block_group, start,
172 size);
173 BUG_ON(ret);
174 start = extent_end + 1;
175 } else {
176 break;
177 }
178 }
179
180 if (start < end) {
181 size = end - start;
182 ret = btrfs_add_free_space(block_group, start, size);
183 BUG_ON(ret);
184 }
185 mutex_unlock(&info->pinned_mutex);
186
187 return 0;
188}
189
190static int remove_sb_from_cache(struct btrfs_root *root,
191 struct btrfs_block_group_cache *cache)
192{
193 u64 bytenr;
194 u64 *logical;
195 int stripe_len;
196 int i, nr, ret;
197
198 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
199 bytenr = btrfs_sb_offset(i);
200 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
201 cache->key.objectid, bytenr, 0,
202 &logical, &nr, &stripe_len);
203 BUG_ON(ret);
204 while (nr--) {
205 btrfs_remove_free_space(cache, logical[nr],
206 stripe_len);
207 }
208 kfree(logical);
209 }
210 return 0;
211}
212
213static int cache_block_group(struct btrfs_root *root,
214 struct btrfs_block_group_cache *block_group)
215{
216 struct btrfs_path *path;
217 int ret = 0;
218 struct btrfs_key key;
219 struct extent_buffer *leaf;
220 int slot;
221 u64 last;
222
223 if (!block_group)
224 return 0;
225
226 root = root->fs_info->extent_root;
227
228 if (block_group->cached)
229 return 0;
230
231 path = btrfs_alloc_path();
232 if (!path)
233 return -ENOMEM;
234
235 path->reada = 2;
236 /*
237 * we get into deadlocks with paths held by callers of this function.
238 * since the alloc_mutex is protecting things right now, just
239 * skip the locking here
240 */
241 path->skip_locking = 1;
242 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
243 key.objectid = last;
244 key.offset = 0;
245 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
246 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
247 if (ret < 0)
248 goto err;
249
250 while (1) {
251 leaf = path->nodes[0];
252 slot = path->slots[0];
253 if (slot >= btrfs_header_nritems(leaf)) {
254 ret = btrfs_next_leaf(root, path);
255 if (ret < 0)
256 goto err;
257 if (ret == 0)
258 continue;
259 else
260 break;
261 }
262 btrfs_item_key_to_cpu(leaf, &key, slot);
263 if (key.objectid < block_group->key.objectid)
264 goto next;
265
266 if (key.objectid >= block_group->key.objectid +
267 block_group->key.offset)
268 break;
269
270 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
271 add_new_free_space(block_group, root->fs_info, last,
272 key.objectid);
273
274 last = key.objectid + key.offset;
275 }
276next:
277 path->slots[0]++;
278 }
279
280 add_new_free_space(block_group, root->fs_info, last,
281 block_group->key.objectid +
282 block_group->key.offset);
283
284 remove_sb_from_cache(root, block_group);
285 block_group->cached = 1;
286 ret = 0;
287err:
288 btrfs_free_path(path);
289 return ret;
290}
291
292/*
293 * return the block group that starts at or after bytenr
294 */
295static struct btrfs_block_group_cache *
296btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
297{
298 struct btrfs_block_group_cache *cache;
299
300 cache = block_group_cache_tree_search(info, bytenr, 0);
301
302 return cache;
303}
304
305/*
306 * return the block group that contains the given bytenr
307 */
308struct btrfs_block_group_cache *btrfs_lookup_block_group(
309 struct btrfs_fs_info *info,
310 u64 bytenr)
311{
312 struct btrfs_block_group_cache *cache;
313
314 cache = block_group_cache_tree_search(info, bytenr, 1);
315
316 return cache;
317}
318
319static inline void put_block_group(struct btrfs_block_group_cache *cache)
320{
321 if (atomic_dec_and_test(&cache->count))
322 kfree(cache);
323}
324
325static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags)
327{
328 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found;
331 list_for_each(cur, head) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags)
334 return found;
335 }
336 return NULL;
337}
338
339static u64 div_factor(u64 num, int factor)
340{
341 if (factor == 10)
342 return num;
343 num *= factor;
344 do_div(num, 10);
345 return num;
346}
347
348u64 btrfs_find_block_group(struct btrfs_root *root,
349 u64 search_start, u64 search_hint, int owner)
350{
351 struct btrfs_block_group_cache *cache;
352 u64 used;
353 u64 last = max(search_hint, search_start);
354 u64 group_start = 0;
355 int full_search = 0;
356 int factor = 9;
357 int wrapped = 0;
358again:
359 while (1) {
360 cache = btrfs_lookup_first_block_group(root->fs_info, last);
361 if (!cache)
362 break;
363
364 spin_lock(&cache->lock);
365 last = cache->key.objectid + cache->key.offset;
366 used = btrfs_block_group_used(&cache->item);
367
368 if ((full_search || !cache->ro) &&
369 block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
370 if (used + cache->pinned + cache->reserved <
371 div_factor(cache->key.offset, factor)) {
372 group_start = cache->key.objectid;
373 spin_unlock(&cache->lock);
374 put_block_group(cache);
375 goto found;
376 }
377 }
378 spin_unlock(&cache->lock);
379 put_block_group(cache);
380 cond_resched();
381 }
382 if (!wrapped) {
383 last = search_start;
384 wrapped = 1;
385 goto again;
386 }
387 if (!full_search && factor < 10) {
388 last = search_start;
389 full_search = 1;
390 factor = 10;
391 goto again;
392 }
393found:
394 return group_start;
395}
396
397/* simple helper to search for an existing extent at a given offset */
398int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
399{
400 int ret;
401 struct btrfs_key key;
402 struct btrfs_path *path;
403
404 path = btrfs_alloc_path();
405 BUG_ON(!path);
406 key.objectid = start;
407 key.offset = len;
408 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
409 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
410 0, 0);
411 btrfs_free_path(path);
412 return ret;
413}
414
415/*
416 * Back reference rules. Back refs have three main goals:
417 *
418 * 1) differentiate between all holders of references to an extent so that
419 * when a reference is dropped we can make sure it was a valid reference
420 * before freeing the extent.
421 *
422 * 2) Provide enough information to quickly find the holders of an extent
423 * if we notice a given block is corrupted or bad.
424 *
425 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
426 * maintenance. This is actually the same as #2, but with a slightly
427 * different use case.
428 *
429 * File extents can be referenced by:
430 *
431 * - multiple snapshots, subvolumes, or different generations in one subvol
432 * - different files inside a single subvolume
433 * - different offsets inside a file (bookend extents in file.c)
434 *
435 * The extent ref structure has fields for:
436 *
437 * - Objectid of the subvolume root
438 * - Generation number of the tree holding the reference
439 * - objectid of the file holding the reference
440 * - number of references held by the parent node (always 1 for tree blocks)
441 *
442 * Btree leaf may hold multiple references to a file extent. In most cases,
443 * these references are from same file and the corresponding offsets inside
444 * the file are close together.
445 *
446 * When a file extent is allocated the fields are filled in:
447 * (root_key.objectid, trans->transid, inode objectid, 1)
448 *
449 * When a leaf is cow'd new references are added for every file extent found
450 * in the leaf. It looks similar to the create case, but trans->transid will
451 * be different when the block is cow'd.
452 *
453 * (root_key.objectid, trans->transid, inode objectid,
454 * number of references in the leaf)
455 *
456 * When a file extent is removed either during snapshot deletion or
457 * file truncation, we find the corresponding back reference and check
458 * the following fields:
459 *
460 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
461 * inode objectid)
462 *
463 * Btree extents can be referenced by:
464 *
465 * - Different subvolumes
466 * - Different generations of the same subvolume
467 *
468 * When a tree block is created, back references are inserted:
469 *
470 * (root->root_key.objectid, trans->transid, level, 1)
471 *
472 * When a tree block is cow'd, new back references are added for all the
473 * blocks it points to. If the tree block isn't in reference counted root,
474 * the old back references are removed. These new back references are of
475 * the form (trans->transid will have increased since creation):
476 *
477 * (root->root_key.objectid, trans->transid, level, 1)
478 *
479 * When a backref is in deleting, the following fields are checked:
480 *
481 * if backref was for a tree root:
482 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
483 * else
484 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
485 *
486 * Back Reference Key composing:
487 *
488 * The key objectid corresponds to the first byte in the extent, the key
489 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
490 * byte of parent extent. If an extent is a tree root, the key offset is set
491 * to the key objectid.
492 */
493
494static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
495 struct btrfs_root *root,
496 struct btrfs_path *path,
497 u64 bytenr, u64 parent,
498 u64 ref_root, u64 ref_generation,
499 u64 owner_objectid, int del)
500{
501 struct btrfs_key key;
502 struct btrfs_extent_ref *ref;
503 struct extent_buffer *leaf;
504 u64 ref_objectid;
505 int ret;
506
507 key.objectid = bytenr;
508 key.type = BTRFS_EXTENT_REF_KEY;
509 key.offset = parent;
510
511 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
512 if (ret < 0)
513 goto out;
514 if (ret > 0) {
515 ret = -ENOENT;
516 goto out;
517 }
518
519 leaf = path->nodes[0];
520 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
521 ref_objectid = btrfs_ref_objectid(leaf, ref);
522 if (btrfs_ref_root(leaf, ref) != ref_root ||
523 btrfs_ref_generation(leaf, ref) != ref_generation ||
524 (ref_objectid != owner_objectid &&
525 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
526 ret = -EIO;
527 WARN_ON(1);
528 goto out;
529 }
530 ret = 0;
531out:
532 return ret;
533}
534
535/*
536 * updates all the backrefs that are pending on update_list for the
537 * extent_root
538 */
/*
 * For each pending op on update_list: find the backref item keyed
 * (op->bytenr, EXTENT_REF, op->orig_parent), check that it carries the
 * expected root, generation and level, re-key it to op->parent and store
 * op->generation in it.  The op's range is then unlocked in
 * info->extent_ins and the op is freed.
 *
 * Always returns 0; a failed search, a mismatching backref or a failed
 * re-key hits BUG.
 */
539static noinline int update_backrefs(struct btrfs_trans_handle *trans,
540 struct btrfs_root *extent_root,
541 struct btrfs_path *path,
542 struct list_head *update_list)
543{
544 struct btrfs_key key;
545 struct btrfs_extent_ref *ref;
546 struct btrfs_fs_info *info = extent_root->fs_info;
547 struct pending_extent_op *op;
548 struct extent_buffer *leaf;
549 int ret = 0;
550 struct list_head *cur = update_list->next;
551 u64 ref_objectid;
552 u64 ref_root = extent_root->root_key.objectid;
553
554 op = list_entry(cur, struct pending_extent_op, list);
555
/* full tree search for the current op's existing backref */
556search:
557 key.objectid = op->bytenr;
558 key.type = BTRFS_EXTENT_REF_KEY;
559 key.offset = op->orig_parent;
560
561 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
562 BUG_ON(ret);
563
564 leaf = path->nodes[0];
565
/* entered with path->slots[0] already on a candidate backref in leaf */
566loop:
567 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
568
569 ref_objectid = btrfs_ref_objectid(leaf, ref);
570
571 if (btrfs_ref_root(leaf, ref) != ref_root ||
572 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
573 (ref_objectid != op->level &&
574 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
575 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
576 "root %llu, owner %u\n",
577 (unsigned long long)op->bytenr,
578 (unsigned long long)op->orig_parent,
579 (unsigned long long)ref_root, op->level);
580 btrfs_print_leaf(extent_root, leaf);
581 BUG();
582 }
583
/* move the backref to its new parent and stamp the new generation */
584 key.objectid = op->bytenr;
585 key.offset = op->parent;
586 key.type = BTRFS_EXTENT_REF_KEY;
587 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
588 BUG_ON(ret);
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590 btrfs_set_ref_generation(leaf, ref, op->generation);
591
/* step to the next list entry before the current op is freed */
592 cur = cur->next;
593
594 list_del_init(&op->list);
595 unlock_extent(&info->extent_ins, op->bytenr,
596 op->bytenr + op->num_bytes - 1, GFP_NOFS);
597 kfree(op);
598
599 if (cur == update_list) {
600 btrfs_mark_buffer_dirty(path->nodes[0]);
601 btrfs_release_path(extent_root, path);
602 goto out;
603 }
604
605 op = list_entry(cur, struct pending_extent_op, list);
606
/*
 * Scan the rest of this leaf for a backref of the next op's extent; only
 * fall back to a fresh tree search if none is found here.
 */
607 path->slots[0]++;
608 while (path->slots[0] < btrfs_header_nritems(leaf)) {
609 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
610 if (key.objectid == op->bytenr &&
611 key.type == BTRFS_EXTENT_REF_KEY)
612 goto loop;
613 path->slots[0]++;
614 }
615
616 btrfs_mark_buffer_dirty(path->nodes[0]);
617 btrfs_release_path(extent_root, path);
618 goto search;
619
620out:
621 return 0;
622}
623
/*
 * Insert an EXTENT_ITEM and an EXTENT_REF item into the extent tree for
 * every pending op on insert_list.  nr is the number of ops on the list;
 * two keys are built per op.  Each op is unlocked in info->extent_ins and
 * freed once both of its items have been written.
 *
 * Returns 0 on success, or -ENOMEM if the key or size array cannot be
 * allocated.  A negative result from btrfs_insert_some_items() hits BUG.
 */
624static noinline int insert_extents(struct btrfs_trans_handle *trans,
625 struct btrfs_root *extent_root,
626 struct btrfs_path *path,
627 struct list_head *insert_list, int nr)
628{
629 struct btrfs_key *keys;
630 u32 *data_size;
631 struct pending_extent_op *op;
632 struct extent_buffer *leaf;
633 struct list_head *cur = insert_list->next;
634 struct btrfs_fs_info *info = extent_root->fs_info;
635 u64 ref_root = extent_root->root_key.objectid;
636 int i = 0, last = 0, ret;
637 int total = nr * 2;
638
639 if (!nr)
640 return 0;
641
642 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
643 if (!keys)
644 return -ENOMEM;
645
646 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
647 if (!data_size) {
648 kfree(keys);
649 return -ENOMEM;
650 }
651
/* build the key/size arrays: extent item first, then its backref */
652 list_for_each_entry(op, insert_list, list) {
653 keys[i].objectid = op->bytenr;
654 keys[i].offset = op->num_bytes;
655 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
656 data_size[i] = sizeof(struct btrfs_extent_item);
657 i++;
658
659 keys[i].objectid = op->bytenr;
660 keys[i].offset = op->parent;
661 keys[i].type = BTRFS_EXTENT_REF_KEY;
662 data_size[i] = sizeof(struct btrfs_extent_ref);
663 i++;
664 }
665
666 op = list_entry(cur, struct pending_extent_op, list);
667 i = 0;
668 while (i < total) {
669 int c;
/* ret is the number of items that were actually inserted this round */
670 ret = btrfs_insert_some_items(trans, extent_root, path,
671 keys+i, data_size+i, total-i);
672 BUG_ON(ret < 0);
673
/* when inserting the out-of-order last key, exactly one item is expected */
674 if (last && ret > 1)
675 BUG();
676
677 leaf = path->nodes[0];
678 for (c = 0; c < ret; c++) {
679 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
680
681 /*
682 * if the first item we inserted was a backref, then
683 * the EXTENT_ITEM will be the odd c's, else it will
684 * be the even c's
685 */
686 if ((ref_first && (c % 2)) ||
687 (!ref_first && !(c % 2))) {
688 struct btrfs_extent_item *itm;
689
690 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
691 struct btrfs_extent_item);
692 btrfs_set_extent_refs(path->nodes[0], itm, 1);
693 op->del++;
694 } else {
695 struct btrfs_extent_ref *ref;
696
697 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
698 struct btrfs_extent_ref);
699 btrfs_set_ref_root(leaf, ref, ref_root);
700 btrfs_set_ref_generation(leaf, ref,
701 op->generation);
702 btrfs_set_ref_objectid(leaf, ref, op->level);
703 btrfs_set_ref_num_refs(leaf, ref, 1);
704 op->del++;
705 }
706
707 /*
708 * using del to see when its ok to free up the
709 * pending_extent_op. In the case where we insert the
710 * last item on the list in order to help do batching
711 * we need to not free the extent op until we actually
712 * insert the extent_item
713 */
714 if (op->del == 2) {
715 unlock_extent(&info->extent_ins, op->bytenr,
716 op->bytenr + op->num_bytes - 1,
717 GFP_NOFS);
718 cur = cur->next;
719 list_del_init(&op->list);
720 kfree(op);
721 if (cur != insert_list)
722 op = list_entry(cur,
723 struct pending_extent_op,
724 list);
725 }
726 }
727 btrfs_mark_buffer_dirty(leaf);
728 btrfs_release_path(extent_root, path);
729
730 /*
731 * Ok backref's and items usually go right next to each other,
732 * but if we could only insert 1 item that means that we
733 * inserted on the end of a leaf, and we have no idea what may
734 * be on the next leaf so we just play it safe. In order to
735 * try and help this case we insert the last thing on our
736 * insert list so hopefully it will end up being the last
737 * thing on the leaf and everything else will be before it,
738 * which will let us insert a whole bunch of items at the same
739 * time.
740 */
741 if (ret == 1 && !last && (i + ret < total)) {
742 /*
743 * last: where we will pick up the next time around
744 * i: our current key to insert, will be total - 1
745 * cur: the current op we are screwing with
746 * op: duh
747 */
748 last = i + ret;
749 i = total - 1;
750 cur = insert_list->prev;
751 op = list_entry(cur, struct pending_extent_op, list);
752 } else if (last) {
753 /*
754 * ok we successfully inserted the last item on the
755 * list, lets reset everything
756 *
757 * i: our current key to insert, so where we left off
758 * last time
759 * last: done with this
760 * cur: the op we are messing with
761 * op: duh
762 * total: since we inserted the last key, we need to
763 * decrement total so we dont overflow
764 */
765 i = last;
766 last = 0;
767 total--;
768 if (i < total) {
769 cur = insert_list->next;
770 op = list_entry(cur, struct pending_extent_op,
771 list);
772 }
773 } else {
774 i += ret;
775 }
776
777 cond_resched();
778 }
779 ret = 0;
780 kfree(keys);
781 kfree(data_size);
782 return ret;
783}
784
785static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
786 struct btrfs_root *root,
787 struct btrfs_path *path,
788 u64 bytenr, u64 parent,
789 u64 ref_root, u64 ref_generation,
790 u64 owner_objectid)
791{
792 struct btrfs_key key;
793 struct extent_buffer *leaf;
794 struct btrfs_extent_ref *ref;
795 u32 num_refs;
796 int ret;
797
798 key.objectid = bytenr;
799 key.type = BTRFS_EXTENT_REF_KEY;
800 key.offset = parent;
801
802 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
803 if (ret == 0) {
804 leaf = path->nodes[0];
805 ref = btrfs_item_ptr(leaf, path->slots[0],
806 struct btrfs_extent_ref);
807 btrfs_set_ref_root(leaf, ref, ref_root);
808 btrfs_set_ref_generation(leaf, ref, ref_generation);
809 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
810 btrfs_set_ref_num_refs(leaf, ref, 1);
811 } else if (ret == -EEXIST) {
812 u64 existing_owner;
813 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
814 leaf = path->nodes[0];
815 ref = btrfs_item_ptr(leaf, path->slots[0],
816 struct btrfs_extent_ref);
817 if (btrfs_ref_root(leaf, ref) != ref_root ||
818 btrfs_ref_generation(leaf, ref) != ref_generation) {
819 ret = -EIO;
820 WARN_ON(1);
821 goto out;
822 }
823
824 num_refs = btrfs_ref_num_refs(leaf, ref);
825 BUG_ON(num_refs == 0);
826 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
827
828 existing_owner = btrfs_ref_objectid(leaf, ref);
829 if (existing_owner != owner_objectid &&
830 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
831 btrfs_set_ref_objectid(leaf, ref,
832 BTRFS_MULTIPLE_OBJECTIDS);
833 }
834 ret = 0;
835 } else {
836 goto out;
837 }
838 btrfs_mark_buffer_dirty(path->nodes[0]);
839out:
840 btrfs_release_path(root, path);
841 return ret;
842}
843
844static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
845 struct btrfs_root *root,
846 struct btrfs_path *path)
847{
848 struct extent_buffer *leaf;
849 struct btrfs_extent_ref *ref;
850 u32 num_refs;
851 int ret = 0;
852
853 leaf = path->nodes[0];
854 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
855 num_refs = btrfs_ref_num_refs(leaf, ref);
856 BUG_ON(num_refs == 0);
857 num_refs -= 1;
858 if (num_refs == 0) {
859 ret = btrfs_del_item(trans, root, path);
860 } else {
861 btrfs_set_ref_num_refs(leaf, ref, num_refs);
862 btrfs_mark_buffer_dirty(leaf);
863 }
864 btrfs_release_path(root, path);
865 return ret;
866}
867
#ifdef BIO_RW_DISCARD
/* Issue a discard for @len bytes at byte offset @start of @bdev. */
static void btrfs_issue_discard(struct block_device *bdev,
				u64 start, u64 len)
{
	/* the block layer call takes 512-byte sector units */
	u64 first_sector = start >> 9;
	u64 nr_sectors = len >> 9;

	blkdev_issue_discard(bdev, first_sector, nr_sectors, GFP_KERNEL);
}
#endif
875
876static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
877 u64 num_bytes)
878{
879#ifdef BIO_RW_DISCARD
880 int ret;
881 u64 map_length = num_bytes;
882 struct btrfs_multi_bio *multi = NULL;
883
884 /* Tell the block device(s) that the sectors can be discarded */
885 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
886 bytenr, &map_length, &multi, 0);
887 if (!ret) {
888 struct btrfs_bio_stripe *stripe = multi->stripes;
889 int i;
890
891 if (map_length > num_bytes)
892 map_length = num_bytes;
893
894 for (i = 0; i < multi->num_stripes; i++, stripe++) {
895 btrfs_issue_discard(stripe->dev->bdev,
896 stripe->physical,
897 map_length);
898 }
899 kfree(multi);
900 }
901
902 return ret;
903#else
904 return 0;
905#endif
906}
907
/*
 * free_extents - drop one reference for every pending_extent_op queued on
 * @del_list, deleting extent items whose reference count reaches zero.
 *
 * The list is consumed from the head until it is empty.  For each op the
 * matching backref is looked up and the reference count of the extent item
 * is decremented.  If the count hits zero the extent is pinned, neighbouring
 * single-reference extent+backref pairs that are also queued on the list are
 * batched into the same deletion, the super block and root "used" counters
 * are reduced, and each deleted op is passed to update_block_group(),
 * unlocked in info->extent_ins and freed.  If references remain, only the
 * backref is removed and the op is unlocked and freed.
 *
 * Returns -ENOMEM if no path can be allocated, the lookup error if a
 * backref cannot be found (remaining list entries are left untouched in
 * that case), otherwise the value left in ret by the last operation (other
 * failures trip BUG_ON()).
 */
908static noinline int free_extents(struct btrfs_trans_handle *trans,
909 struct btrfs_root *extent_root,
910 struct list_head *del_list)
911{
912 struct btrfs_fs_info *info = extent_root->fs_info;
913 struct btrfs_path *path;
914 struct btrfs_key key, found_key;
915 struct extent_buffer *leaf;
916 struct list_head *cur;
917 struct pending_extent_op *op;
918 struct btrfs_extent_item *ei;
919 int ret, num_to_del, extent_slot = 0, found_extent = 0;
920 u32 refs;
921 u64 bytes_freed = 0;
922
923 path = btrfs_alloc_path();
924 if (!path)
925 return -ENOMEM;
926 path->reada = 1;
927
928search:
929 /* search for the backref for the current ref we want to delete */
930 cur = del_list->next;
931 op = list_entry(cur, struct pending_extent_op, list);
932 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
933 op->orig_parent,
934 extent_root->root_key.objectid,
935 op->orig_generation, op->level, 1);
936 if (ret) {
937 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
938 "root %llu gen %llu owner %u\n",
939 (unsigned long long)op->bytenr,
940 (unsigned long long)extent_root->root_key.objectid,
941 (unsigned long long)op->orig_generation, op->level);
942 btrfs_print_leaf(extent_root, path->nodes[0]);
943 WARN_ON(1);
944 goto out;
945 }
946
947 extent_slot = path->slots[0];
948 num_to_del = 1;
949 found_extent = 0;
950
951 /*
952 * if we aren't the first item on the leaf we can move back one and see
953 * if our ref is right next to our extent item
954 */
955 if (likely(extent_slot)) {
956 extent_slot--;
957 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
958 extent_slot);
959 if (found_key.objectid == op->bytenr &&
960 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
961 found_key.offset == op->num_bytes) {
962 num_to_del++;
963 found_extent = 1;
964 }
965 }
966
967 /*
968 * if we didn't find the extent we need to delete the backref and then
969 * search for the extent item key so we can update its ref count
970 */
971 if (!found_extent) {
972 key.objectid = op->bytenr;
973 key.type = BTRFS_EXTENT_ITEM_KEY;
974 key.offset = op->num_bytes;
975
976 ret = remove_extent_backref(trans, extent_root, path);
977 BUG_ON(ret);
978 btrfs_release_path(extent_root, path);
979 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
980 BUG_ON(ret);
981 extent_slot = path->slots[0];
982 }
983
984 /* this is where we update the ref count for the extent */
985 leaf = path->nodes[0];
986 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
987 refs = btrfs_extent_refs(leaf, ei);
988 BUG_ON(refs == 0);
989 refs--;
990 btrfs_set_extent_refs(leaf, ei, refs);
991
992 btrfs_mark_buffer_dirty(leaf);
993
994 /*
995 * This extent needs deleting. The reason cur_slot is extent_slot +
996 * num_to_del is because extent_slot points to the slot where the extent
997 * is, and if the backref was not right next to the extent we will be
998 * deleting at least 1 item, and will want to start searching at the
999 * slot directly next to extent_slot. However if we did find the
1000 * backref next to the extent item then we will be deleting at least 2
1001 * items and will want to start searching directly after the ref slot
1002 */
1003 if (!refs) {
1004 struct list_head *pos, *n, *end;
1005 int cur_slot = extent_slot+num_to_del;
1006 u64 super_used;
1007 u64 root_used;
1008
1009 path->slots[0] = extent_slot;
1010 bytes_freed = op->num_bytes;
1011
1012 mutex_lock(&info->pinned_mutex);
1013 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1014 op->num_bytes, op->level >=
1015 BTRFS_FIRST_FREE_OBJECTID);
1016 mutex_unlock(&info->pinned_mutex);
1017 BUG_ON(ret < 0);
1018 op->del = ret;
1019
1020 /*
1021 * we need to see if we can delete multiple things at once, so
1022 * start looping through the list of extents we are wanting to
1023 * delete and see if their extent/backref's are right next to
1024 * each other and the extents only have 1 ref
1025 */
1026 for (pos = cur->next; pos != del_list; pos = pos->next) {
1027 struct pending_extent_op *tmp;
1028
1029 tmp = list_entry(pos, struct pending_extent_op, list);
1030
1031 /* we only want to delete extent+ref at this stage */
1032 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1033 break;
1034
1035 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1036 if (found_key.objectid != tmp->bytenr ||
1037 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1038 found_key.offset != tmp->num_bytes)
1039 break;
1040
1041 /* check to make sure this extent only has one ref */
1042 ei = btrfs_item_ptr(leaf, cur_slot,
1043 struct btrfs_extent_item);
1044 if (btrfs_extent_refs(leaf, ei) != 1)
1045 break;
1046
1047 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1048 if (found_key.objectid != tmp->bytenr ||
1049 found_key.type != BTRFS_EXTENT_REF_KEY ||
1050 found_key.offset != tmp->orig_parent)
1051 break;
1052
1053 /*
1054 * the ref is right next to the extent, we can set the
1055 * ref count to 0 since we will delete them both now
1056 */
1057 btrfs_set_extent_refs(leaf, ei, 0);
1058
1059 /* pin down the bytes for this extent */
1060 mutex_lock(&info->pinned_mutex);
1061 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1062 tmp->num_bytes, tmp->level >=
1063 BTRFS_FIRST_FREE_OBJECTID);
1064 mutex_unlock(&info->pinned_mutex);
1065 BUG_ON(ret < 0);
1066
1067 /*
1068 * use the del field to tell if we need to go ahead and
1069 * free up the extent when we delete the item or not.
1070 */
1071 tmp->del = ret;
1072 bytes_freed += tmp->num_bytes;
1073
1074 num_to_del += 2;
1075 cur_slot += 2;
1076 }
/* first list entry that was NOT folded into this batch */
1077 end = pos;
1078
1079 /* update the free space counters */
1080 spin_lock(&info->delalloc_lock);
1081 super_used = btrfs_super_bytes_used(&info->super_copy);
1082 btrfs_set_super_bytes_used(&info->super_copy,
1083 super_used - bytes_freed);
1084
1085 root_used = btrfs_root_used(&extent_root->root_item);
1086 btrfs_set_root_used(&extent_root->root_item,
1087 root_used - bytes_freed);
1088 spin_unlock(&info->delalloc_lock);
1089
1090 /* delete the items */
1091 ret = btrfs_del_items(trans, extent_root, path,
1092 path->slots[0], num_to_del);
1093 BUG_ON(ret);
1094
1095 /*
1096 * loop through the extents we deleted and do the cleanup work
1097 * on them
1098 */
/* n is fetched before the body runs because tmp is freed inside it */
1099 for (pos = cur, n = pos->next; pos != end;
1100 pos = n, n = pos->next) {
1101 struct pending_extent_op *tmp;
1102 tmp = list_entry(pos, struct pending_extent_op, list);
1103
1104 /*
1105 * remember tmp->del tells us whether or not we pinned
1106 * down the extent
1107 */
1108 ret = update_block_group(trans, extent_root,
1109 tmp->bytenr, tmp->num_bytes, 0,
1110 tmp->del);
1111 BUG_ON(ret);
1112
1113 list_del_init(&tmp->list);
1114 unlock_extent(&info->extent_ins, tmp->bytenr,
1115 tmp->bytenr + tmp->num_bytes - 1,
1116 GFP_NOFS);
1117 kfree(tmp);
1118 }
1119 } else if (refs && found_extent) {
1120 /*
1121 * the ref and extent were right next to each other, but the
1122 * extent still has a ref, so just free the backref and keep
1123 * going
1124 */
1125 ret = remove_extent_backref(trans, extent_root, path);
1126 BUG_ON(ret);
1127
1128 list_del_init(&op->list);
1129 unlock_extent(&info->extent_ins, op->bytenr,
1130 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1131 kfree(op);
1132 } else {
1133 /*
1134 * the extent has multiple refs and the backref we were looking
1135 * for was not right next to it, so just unlock and go next,
1136 * we're good to go
1137 */
1138 list_del_init(&op->list);
1139 unlock_extent(&info->extent_ins, op->bytenr,
1140 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1141 kfree(op);
1142 }
1143
1144 btrfs_release_path(extent_root, path);
/* keep going until every queued op has been consumed */
1145 if (!list_empty(del_list))
1146 goto search;
1147
1148out:
1149 btrfs_free_path(path);
1150 return ret;
1151}
1152
1153static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1154 struct btrfs_root *root, u64 bytenr,
1155 u64 orig_parent, u64 parent,
1156 u64 orig_root, u64 ref_root,
1157 u64 orig_generation, u64 ref_generation,
1158 u64 owner_objectid)
1159{
1160 int ret;
1161 struct btrfs_root *extent_root = root->fs_info->extent_root;
1162 struct btrfs_path *path;
1163
1164 if (root == root->fs_info->extent_root) {
1165 struct pending_extent_op *extent_op;
1166 u64 num_bytes;
1167
1168 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1169 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1170 mutex_lock(&root->fs_info->extent_ins_mutex);
1171 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1172 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1173 u64 priv;
1174 ret = get_state_private(&root->fs_info->extent_ins,
1175 bytenr, &priv);
1176 BUG_ON(ret);
1177 extent_op = (struct pending_extent_op *)
1178 (unsigned long)priv;
1179 BUG_ON(extent_op->parent != orig_parent);
1180 BUG_ON(extent_op->generation != orig_generation);
1181
1182 extent_op->parent = parent;
1183 extent_op->generation = ref_generation;
1184 } else {
1185 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1186 BUG_ON(!extent_op);
1187
1188 extent_op->type = PENDING_BACKREF_UPDATE;
1189 extent_op->bytenr = bytenr;
1190 extent_op->num_bytes = num_bytes;
1191 extent_op->parent = parent;
1192 extent_op->orig_parent = orig_parent;
1193 extent_op->generation = ref_generation;
1194 extent_op->orig_generation = orig_generation;
1195 extent_op->level = (int)owner_objectid;
1196 INIT_LIST_HEAD(&extent_op->list);
1197 extent_op->del = 0;
1198
1199 set_extent_bits(&root->fs_info->extent_ins,
1200 bytenr, bytenr + num_bytes - 1,
1201 EXTENT_WRITEBACK, GFP_NOFS);
1202 set_state_private(&root->fs_info->extent_ins,
1203 bytenr, (unsigned long)extent_op);
1204 }
1205 mutex_unlock(&root->fs_info->extent_ins_mutex);
1206 return 0;
1207 }
1208
1209 path = btrfs_alloc_path();
1210 if (!path)
1211 return -ENOMEM;
1212 ret = lookup_extent_backref(trans, extent_root, path,
1213 bytenr, orig_parent, orig_root,
1214 orig_generation, owner_objectid, 1);
1215 if (ret)
1216 goto out;
1217 ret = remove_extent_backref(trans, extent_root, path);
1218 if (ret)
1219 goto out;
1220 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1221 parent, ref_root, ref_generation,
1222 owner_objectid);
1223 BUG_ON(ret);
1224 finish_current_insert(trans, extent_root, 0);
1225 del_pending_extents(trans, extent_root, 0);
1226out:
1227 btrfs_free_path(path);
1228 return ret;
1229}
1230
1231int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1232 struct btrfs_root *root, u64 bytenr,
1233 u64 orig_parent, u64 parent,
1234 u64 ref_root, u64 ref_generation,
1235 u64 owner_objectid)
1236{
1237 int ret;
1238 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1239 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1240 return 0;
1241 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1242 parent, ref_root, ref_root,
1243 ref_generation, ref_generation,
1244 owner_objectid);
1245 return ret;
1246}
1247
1248static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1249 struct btrfs_root *root, u64 bytenr,
1250 u64 orig_parent, u64 parent,
1251 u64 orig_root, u64 ref_root,
1252 u64 orig_generation, u64 ref_generation,
1253 u64 owner_objectid)
1254{
1255 struct btrfs_path *path;
1256 int ret;
1257 struct btrfs_key key;
1258 struct extent_buffer *l;
1259 struct btrfs_extent_item *item;
1260 u32 refs;
1261
1262 path = btrfs_alloc_path();
1263 if (!path)
1264 return -ENOMEM;
1265
1266 path->reada = 1;
1267 key.objectid = bytenr;
1268 key.type = BTRFS_EXTENT_ITEM_KEY;
1269 key.offset = (u64)-1;
1270
1271 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1272 0, 1);
1273 if (ret < 0)
1274 return ret;
1275 BUG_ON(ret == 0 || path->slots[0] == 0);
1276
1277 path->slots[0]--;
1278 l = path->nodes[0];
1279
1280 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1281 if (key.objectid != bytenr) {
1282 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1283 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
1284 (unsigned long long)bytenr,
1285 (unsigned long long)key.objectid);
1286 BUG();
1287 }
1288 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1289
1290 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1291 refs = btrfs_extent_refs(l, item);
1292 btrfs_set_extent_refs(l, item, refs + 1);
1293 btrfs_mark_buffer_dirty(path->nodes[0]);
1294
1295 btrfs_release_path(root->fs_info->extent_root, path);
1296
1297 path->reada = 1;
1298 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1299 path, bytenr, parent,
1300 ref_root, ref_generation,
1301 owner_objectid);
1302 BUG_ON(ret);
1303 finish_current_insert(trans, root->fs_info->extent_root, 0);
1304 del_pending_extents(trans, root->fs_info->extent_root, 0);
1305
1306 btrfs_free_path(path);
1307 return 0;
1308}
1309
1310int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1311 struct btrfs_root *root,
1312 u64 bytenr, u64 num_bytes, u64 parent,
1313 u64 ref_root, u64 ref_generation,
1314 u64 owner_objectid)
1315{
1316 int ret;
1317 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1318 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1319 return 0;
1320 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1321 0, ref_root, 0, ref_generation,
1322 owner_objectid);
1323 return ret;
1324}
1325
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1);
1330 del_pending_extents(trans, root->fs_info->extent_root, 1);
1331 return 0;
1332}
1333
1334int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1335 struct btrfs_root *root, u64 bytenr,
1336 u64 num_bytes, u32 *refs)
1337{
1338 struct btrfs_path *path;
1339 int ret;
1340 struct btrfs_key key;
1341 struct extent_buffer *l;
1342 struct btrfs_extent_item *item;
1343
1344 WARN_ON(num_bytes < root->sectorsize);
1345 path = btrfs_alloc_path();
1346 path->reada = 1;
1347 key.objectid = bytenr;
1348 key.offset = num_bytes;
1349 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1350 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1351 0, 0);
1352 if (ret < 0)
1353 goto out;
1354 if (ret != 0) {
1355 btrfs_print_leaf(root, path->nodes[0]);
1356 printk(KERN_INFO "btrfs failed to find block number %llu\n",
1357 (unsigned long long)bytenr);
1358 BUG();
1359 }
1360 l = path->nodes[0];
1361 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1362 *refs = btrfs_extent_refs(l, item);
1363out:
1364 btrfs_free_path(path);
1365 return 0;
1366}
1367
1368int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1369 struct btrfs_root *root, u64 objectid, u64 bytenr)
1370{
1371 struct btrfs_root *extent_root = root->fs_info->extent_root;
1372 struct btrfs_path *path;
1373 struct extent_buffer *leaf;
1374 struct btrfs_extent_ref *ref_item;
1375 struct btrfs_key key;
1376 struct btrfs_key found_key;
1377 u64 ref_root;
1378 u64 last_snapshot;
1379 u32 nritems;
1380 int ret;
1381
1382 key.objectid = bytenr;
1383 key.offset = (u64)-1;
1384 key.type = BTRFS_EXTENT_ITEM_KEY;
1385
1386 path = btrfs_alloc_path();
1387 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1388 if (ret < 0)
1389 goto out;
1390 BUG_ON(ret == 0);
1391
1392 ret = -ENOENT;
1393 if (path->slots[0] == 0)
1394 goto out;
1395
1396 path->slots[0]--;
1397 leaf = path->nodes[0];
1398 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1399
1400 if (found_key.objectid != bytenr ||
1401 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1402 goto out;
1403
1404 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1405 while (1) {
1406 leaf = path->nodes[0];
1407 nritems = btrfs_header_nritems(leaf);
1408 if (path->slots[0] >= nritems) {
1409 ret = btrfs_next_leaf(extent_root, path);
1410 if (ret < 0)
1411 goto out;
1412 if (ret == 0)
1413 continue;
1414 break;
1415 }
1416 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1417 if (found_key.objectid != bytenr)
1418 break;
1419
1420 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1421 path->slots[0]++;
1422 continue;
1423 }
1424
1425 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1426 struct btrfs_extent_ref);
1427 ref_root = btrfs_ref_root(leaf, ref_item);
1428 if ((ref_root != root->root_key.objectid &&
1429 ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1430 objectid != btrfs_ref_objectid(leaf, ref_item)) {
1431 ret = 1;
1432 goto out;
1433 }
1434 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1435 ret = 1;
1436 goto out;
1437 }
1438
1439 path->slots[0]++;
1440 }
1441 ret = 0;
1442out:
1443 btrfs_free_path(path);
1444 return ret;
1445}
1446
1447int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1448 struct extent_buffer *buf, u32 nr_extents)
1449{
1450 struct btrfs_key key;
1451 struct btrfs_file_extent_item *fi;
1452 u64 root_gen;
1453 u32 nritems;
1454 int i;
1455 int level;
1456 int ret = 0;
1457 int shared = 0;
1458
1459 if (!root->ref_cows)
1460 return 0;
1461
1462 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1463 shared = 0;
1464 root_gen = root->root_key.offset;
1465 } else {
1466 shared = 1;
1467 root_gen = trans->transid - 1;
1468 }
1469
1470 level = btrfs_header_level(buf);
1471 nritems = btrfs_header_nritems(buf);
1472
1473 if (level == 0) {
1474 struct btrfs_leaf_ref *ref;
1475 struct btrfs_extent_info *info;
1476
1477 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1478 if (!ref) {
1479 ret = -ENOMEM;
1480 goto out;
1481 }
1482
1483 ref->root_gen = root_gen;
1484 ref->bytenr = buf->start;
1485 ref->owner = btrfs_header_owner(buf);
1486 ref->generation = btrfs_header_generation(buf);
1487 ref->nritems = nr_extents;
1488 info = ref->extents;
1489
1490 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1491 u64 disk_bytenr;
1492 btrfs_item_key_to_cpu(buf, &key, i);
1493 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1494 continue;
1495 fi = btrfs_item_ptr(buf, i,
1496 struct btrfs_file_extent_item);
1497 if (btrfs_file_extent_type(buf, fi) ==
1498 BTRFS_FILE_EXTENT_INLINE)
1499 continue;
1500 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1501 if (disk_bytenr == 0)
1502 continue;
1503
1504 info->bytenr = disk_bytenr;
1505 info->num_bytes =
1506 btrfs_file_extent_disk_num_bytes(buf, fi);
1507 info->objectid = key.objectid;
1508 info->offset = key.offset;
1509 info++;
1510 }
1511
1512 ret = btrfs_add_leaf_ref(root, ref, shared);
1513 if (ret == -EEXIST && shared) {
1514 struct btrfs_leaf_ref *old;
1515 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1516 BUG_ON(!old);
1517 btrfs_remove_leaf_ref(root, old);
1518 btrfs_free_leaf_ref(root, old);
1519 ret = btrfs_add_leaf_ref(root, ref, shared);
1520 }
1521 WARN_ON(ret);
1522 btrfs_free_leaf_ref(root, ref);
1523 }
1524out:
1525 return ret;
1526}
1527
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1530 u32 *nr_extents)
1531{
1532 u64 bytenr;
1533 u64 ref_root;
1534 u64 orig_root;
1535 u64 ref_generation;
1536 u64 orig_generation;
1537 u32 nritems;
1538 u32 nr_file_extents = 0;
1539 struct btrfs_key key;
1540 struct btrfs_file_extent_item *fi;
1541 int i;
1542 int level;
1543 int ret = 0;
1544 int faili = 0;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64);
1547
1548 ref_root = btrfs_header_owner(buf);
1549 ref_generation = btrfs_header_generation(buf);
1550 orig_root = btrfs_header_owner(orig_buf);
1551 orig_generation = btrfs_header_generation(orig_buf);
1552
1553 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf);
1555
1556 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref;
1558 } else {
1559 if (level == 0 &&
1560 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1561 goto out;
1562 if (level != 0 &&
1563 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1564 goto out;
1565 process_func = __btrfs_update_extent_ref;
1566 }
1567
1568 for (i = 0; i < nritems; i++) {
1569 cond_resched();
1570 if (level == 0) {
1571 btrfs_item_key_to_cpu(buf, &key, i);
1572 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1573 continue;
1574 fi = btrfs_item_ptr(buf, i,
1575 struct btrfs_file_extent_item);
1576 if (btrfs_file_extent_type(buf, fi) ==
1577 BTRFS_FILE_EXTENT_INLINE)
1578 continue;
1579 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1580 if (bytenr == 0)
1581 continue;
1582
1583 nr_file_extents++;
1584
1585 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start,
1587 orig_root, ref_root,
1588 orig_generation, ref_generation,
1589 key.objectid);
1590
1591 if (ret) {
1592 faili = i;
1593 WARN_ON(1);
1594 goto fail;
1595 }
1596 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start,
1600 orig_root, ref_root,
1601 orig_generation, ref_generation,
1602 level - 1);
1603 if (ret) {
1604 faili = i;
1605 WARN_ON(1);
1606 goto fail;
1607 }
1608 }
1609 }
1610out:
1611 if (nr_extents) {
1612 if (level == 0)
1613 *nr_extents = nr_file_extents;
1614 else
1615 *nr_extents = nritems;
1616 }
1617 return 0;
1618fail:
1619 WARN_ON(1);
1620 return ret;
1621}
1622
1623int btrfs_update_ref(struct btrfs_trans_handle *trans,
1624 struct btrfs_root *root, struct extent_buffer *orig_buf,
1625 struct extent_buffer *buf, int start_slot, int nr)
1626
1627{
1628 u64 bytenr;
1629 u64 ref_root;
1630 u64 orig_root;
1631 u64 ref_generation;
1632 u64 orig_generation;
1633 struct btrfs_key key;
1634 struct btrfs_file_extent_item *fi;
1635 int i;
1636 int ret;
1637 int slot;
1638 int level;
1639
1640 BUG_ON(start_slot < 0);
1641 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1642
1643 ref_root = btrfs_header_owner(buf);
1644 ref_generation = btrfs_header_generation(buf);
1645 orig_root = btrfs_header_owner(orig_buf);
1646 orig_generation = btrfs_header_generation(orig_buf);
1647 level = btrfs_header_level(buf);
1648
1649 if (!root->ref_cows) {
1650 if (level == 0 &&
1651 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1652 return 0;
1653 if (level != 0 &&
1654 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1655 return 0;
1656 }
1657
1658 for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1659 cond_resched();
1660 if (level == 0) {
1661 btrfs_item_key_to_cpu(buf, &key, slot);
1662 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1663 continue;
1664 fi = btrfs_item_ptr(buf, slot,
1665 struct btrfs_file_extent_item);
1666 if (btrfs_file_extent_type(buf, fi) ==
1667 BTRFS_FILE_EXTENT_INLINE)
1668 continue;
1669 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1670 if (bytenr == 0)
1671 continue;
1672 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1673 orig_buf->start, buf->start,
1674 orig_root, ref_root,
1675 orig_generation, ref_generation,
1676 key.objectid);
1677 if (ret)
1678 goto fail;
1679 } else {
1680 bytenr = btrfs_node_blockptr(buf, slot);
1681 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1682 orig_buf->start, buf->start,
1683 orig_root, ref_root,
1684 orig_generation, ref_generation,
1685 level - 1);
1686 if (ret)
1687 goto fail;
1688 }
1689 }
1690 return 0;
1691fail:
1692 WARN_ON(1);
1693 return -1;
1694}
1695
1696static int write_one_cache_group(struct btrfs_trans_handle *trans,
1697 struct btrfs_root *root,
1698 struct btrfs_path *path,
1699 struct btrfs_block_group_cache *cache)
1700{
1701 int ret;
1702 int pending_ret;
1703 struct btrfs_root *extent_root = root->fs_info->extent_root;
1704 unsigned long bi;
1705 struct extent_buffer *leaf;
1706
1707 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1708 if (ret < 0)
1709 goto fail;
1710 BUG_ON(ret);
1711
1712 leaf = path->nodes[0];
1713 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1714 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1715 btrfs_mark_buffer_dirty(leaf);
1716 btrfs_release_path(extent_root, path);
1717fail:
1718 finish_current_insert(trans, extent_root, 0);
1719 pending_ret = del_pending_extents(trans, extent_root, 0);
1720 if (ret)
1721 return ret;
1722 if (pending_ret)
1723 return pending_ret;
1724 return 0;
1725
1726}
1727
1728int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1729 struct btrfs_root *root)
1730{
1731 struct btrfs_block_group_cache *cache, *entry;
1732 struct rb_node *n;
1733 int err = 0;
1734 int werr = 0;
1735 struct btrfs_path *path;
1736 u64 last = 0;
1737
1738 path = btrfs_alloc_path();
1739 if (!path)
1740 return -ENOMEM;
1741
1742 while (1) {
1743 cache = NULL;
1744 spin_lock(&root->fs_info->block_group_cache_lock);
1745 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1746 n; n = rb_next(n)) {
1747 entry = rb_entry(n, struct btrfs_block_group_cache,
1748 cache_node);
1749 if (entry->dirty) {
1750 cache = entry;
1751 break;
1752 }
1753 }
1754 spin_unlock(&root->fs_info->block_group_cache_lock);
1755
1756 if (!cache)
1757 break;
1758
1759 cache->dirty = 0;
1760 last += cache->key.offset;
1761
1762 err = write_one_cache_group(trans, root,
1763 path, cache);
1764 /*
1765 * if we fail to write the cache group, we want
1766 * to keep it marked dirty in hopes that a later
1767 * write will work
1768 */
1769 if (err) {
1770 werr = err;
1771 continue;
1772 }
1773 }
1774 btrfs_free_path(path);
1775 return werr;
1776}
1777
1778int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1779{
1780 struct btrfs_block_group_cache *block_group;
1781 int readonly = 0;
1782
1783 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1784 if (!block_group || block_group->ro)
1785 readonly = 1;
1786 if (block_group)
1787 put_block_group(block_group);
1788 return readonly;
1789}
1790
1791static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1792 u64 total_bytes, u64 bytes_used,
1793 struct btrfs_space_info **space_info)
1794{
1795 struct btrfs_space_info *found;
1796
1797 found = __find_space_info(info, flags);
1798 if (found) {
1799 spin_lock(&found->lock);
1800 found->total_bytes += total_bytes;
1801 found->bytes_used += bytes_used;
1802 found->full = 0;
1803 spin_unlock(&found->lock);
1804 *space_info = found;
1805 return 0;
1806 }
1807 found = kzalloc(sizeof(*found), GFP_NOFS);
1808 if (!found)
1809 return -ENOMEM;
1810
1811 list_add(&found->list, &info->space_info);
1812 INIT_LIST_HEAD(&found->block_groups);
1813 init_rwsem(&found->groups_sem);
1814 spin_lock_init(&found->lock);
1815 found->flags = flags;
1816 found->total_bytes = total_bytes;
1817 found->bytes_used = bytes_used;
1818 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0;
1821 found->full = 0;
1822 found->force_alloc = 0;
1823 *space_info = found;
1824 return 0;
1825}
1826
1827static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1828{
1829 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1830 BTRFS_BLOCK_GROUP_RAID1 |
1831 BTRFS_BLOCK_GROUP_RAID10 |
1832 BTRFS_BLOCK_GROUP_DUP);
1833 if (extra_flags) {
1834 if (flags & BTRFS_BLOCK_GROUP_DATA)
1835 fs_info->avail_data_alloc_bits |= extra_flags;
1836 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1837 fs_info->avail_metadata_alloc_bits |= extra_flags;
1838 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1839 fs_info->avail_system_alloc_bits |= extra_flags;
1840 }
1841}
1842
1843static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1844{
1845 spin_lock(&cache->space_info->lock);
1846 spin_lock(&cache->lock);
1847 if (!cache->ro) {
1848 cache->space_info->bytes_readonly += cache->key.offset -
1849 btrfs_block_group_used(&cache->item);
1850 cache->ro = 1;
1851 }
1852 spin_unlock(&cache->lock);
1853 spin_unlock(&cache->space_info->lock);
1854}
1855
1856u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1857{
1858 u64 num_devices = root->fs_info->fs_devices->rw_devices;
1859
1860 if (num_devices == 1)
1861 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1862 if (num_devices < 4)
1863 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1864
1865 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1866 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1867 BTRFS_BLOCK_GROUP_RAID10))) {
1868 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1869 }
1870
1871 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1872 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1873 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1874 }
1875
1876 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1877 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1878 (flags & BTRFS_BLOCK_GROUP_RAID10) |
1879 (flags & BTRFS_BLOCK_GROUP_DUP)))
1880 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1881 return flags;
1882}
1883
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force)
1887{
1888 struct btrfs_space_info *space_info;
1889 u64 thresh;
1890 int ret = 0;
1891
1892 mutex_lock(&extent_root->fs_info->chunk_mutex);
1893
1894 flags = btrfs_reduce_alloc_profile(extent_root, flags);
1895
1896 space_info = __find_space_info(extent_root->fs_info, flags);
1897 if (!space_info) {
1898 ret = update_space_info(extent_root->fs_info, flags,
1899 0, 0, &space_info);
1900 BUG_ON(ret);
1901 }
1902 BUG_ON(!space_info);
1903
1904 spin_lock(&space_info->lock);
1905 if (space_info->force_alloc) {
1906 force = 1;
1907 space_info->force_alloc = 0;
1908 }
1909 if (space_info->full) {
1910 spin_unlock(&space_info->lock);
1911 goto out;
1912 }
1913
1914 thresh = space_info->total_bytes - space_info->bytes_readonly;
1915 thresh = div_factor(thresh, 6);
1916 if (!force &&
1917 (space_info->bytes_used + space_info->bytes_pinned +
1918 space_info->bytes_reserved + alloc_bytes) < thresh) {
1919 spin_unlock(&space_info->lock);
1920 goto out;
1921 }
1922 spin_unlock(&space_info->lock);
1923
1924 ret = btrfs_alloc_chunk(trans, extent_root, flags);
1925 if (ret)
1926 space_info->full = 1;
1927out:
1928 mutex_unlock(&extent_root->fs_info->chunk_mutex);
1929 return ret;
1930}
1931
1932static int update_block_group(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root,
1934 u64 bytenr, u64 num_bytes, int alloc,
1935 int mark_free)
1936{
1937 struct btrfs_block_group_cache *cache;
1938 struct btrfs_fs_info *info = root->fs_info;
1939 u64 total = num_bytes;
1940 u64 old_val;
1941 u64 byte_in_group;
1942
1943 while (total) {
1944 cache = btrfs_lookup_block_group(info, bytenr);
1945 if (!cache)
1946 return -1;
1947 byte_in_group = bytenr - cache->key.objectid;
1948 WARN_ON(byte_in_group > cache->key.offset);
1949
1950 spin_lock(&cache->space_info->lock);
1951 spin_lock(&cache->lock);
1952 cache->dirty = 1;
1953 old_val = btrfs_block_group_used(&cache->item);
1954 num_bytes = min(total, cache->key.offset - byte_in_group);
1955 if (alloc) {
1956 old_val += num_bytes;
1957 cache->space_info->bytes_used += num_bytes;
1958 if (cache->ro)
1959 cache->space_info->bytes_readonly -= num_bytes;
1960 btrfs_set_block_group_used(&cache->item, old_val);
1961 spin_unlock(&cache->lock);
1962 spin_unlock(&cache->space_info->lock);
1963 } else {
1964 old_val -= num_bytes;
1965 cache->space_info->bytes_used -= num_bytes;
1966 if (cache->ro)
1967 cache->space_info->bytes_readonly += num_bytes;
1968 btrfs_set_block_group_used(&cache->item, old_val);
1969 spin_unlock(&cache->lock);
1970 spin_unlock(&cache->space_info->lock);
1971 if (mark_free) {
1972 int ret;
1973
1974 ret = btrfs_discard_extent(root, bytenr,
1975 num_bytes);
1976 WARN_ON(ret);
1977
1978 ret = btrfs_add_free_space(cache, bytenr,
1979 num_bytes);
1980 WARN_ON(ret);
1981 }
1982 }
1983 put_block_group(cache);
1984 total -= num_bytes;
1985 bytenr += num_bytes;
1986 }
1987 return 0;
1988}
1989
1990static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1991{
1992 struct btrfs_block_group_cache *cache;
1993 u64 bytenr;
1994
1995 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1996 if (!cache)
1997 return 0;
1998
1999 bytenr = cache->key.objectid;
2000 put_block_group(cache);
2001
2002 return bytenr;
2003}
2004
2005int btrfs_update_pinned_extents(struct btrfs_root *root,
2006 u64 bytenr, u64 num, int pin)
2007{
2008 u64 len;
2009 struct btrfs_block_group_cache *cache;
2010 struct btrfs_fs_info *fs_info = root->fs_info;
2011
2012 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2013 if (pin) {
2014 set_extent_dirty(&fs_info->pinned_extents,
2015 bytenr, bytenr + num - 1, GFP_NOFS);
2016 } else {
2017 clear_extent_dirty(&fs_info->pinned_extents,
2018 bytenr, bytenr + num - 1, GFP_NOFS);
2019 }
2020 while (num > 0) {
2021 cache = btrfs_lookup_block_group(fs_info, bytenr);
2022 BUG_ON(!cache);
2023 len = min(num, cache->key.offset -
2024 (bytenr - cache->key.objectid));
2025 if (pin) {
2026 spin_lock(&cache->space_info->lock);
2027 spin_lock(&cache->lock);
2028 cache->pinned += len;
2029 cache->space_info->bytes_pinned += len;
2030 spin_unlock(&cache->lock);
2031 spin_unlock(&cache->space_info->lock);
2032 fs_info->total_pinned += len;
2033 } else {
2034 spin_lock(&cache->space_info->lock);
2035 spin_lock(&cache->lock);
2036 cache->pinned -= len;
2037 cache->space_info->bytes_pinned -= len;
2038 spin_unlock(&cache->lock);
2039 spin_unlock(&cache->space_info->lock);
2040 fs_info->total_pinned -= len;
2041 if (cache->cached)
2042 btrfs_add_free_space(cache, bytenr, len);
2043 }
2044 put_block_group(cache);
2045 bytenr += len;
2046 num -= len;
2047 }
2048 return 0;
2049}
2050
2051static int update_reserved_extents(struct btrfs_root *root,
2052 u64 bytenr, u64 num, int reserve)
2053{
2054 u64 len;
2055 struct btrfs_block_group_cache *cache;
2056 struct btrfs_fs_info *fs_info = root->fs_info;
2057
2058 while (num > 0) {
2059 cache = btrfs_lookup_block_group(fs_info, bytenr);
2060 BUG_ON(!cache);
2061 len = min(num, cache->key.offset -
2062 (bytenr - cache->key.objectid));
2063
2064 spin_lock(&cache->space_info->lock);
2065 spin_lock(&cache->lock);
2066 if (reserve) {
2067 cache->reserved += len;
2068 cache->space_info->bytes_reserved += len;
2069 } else {
2070 cache->reserved -= len;
2071 cache->space_info->bytes_reserved -= len;
2072 }
2073 spin_unlock(&cache->lock);
2074 spin_unlock(&cache->space_info->lock);
2075 put_block_group(cache);
2076 bytenr += len;
2077 num -= len;
2078 }
2079 return 0;
2080}
2081
2082int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2083{
2084 u64 last = 0;
2085 u64 start;
2086 u64 end;
2087 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2088 int ret;
2089
2090 mutex_lock(&root->fs_info->pinned_mutex);
2091 while (1) {
2092 ret = find_first_extent_bit(pinned_extents, last,
2093 &start, &end, EXTENT_DIRTY);
2094 if (ret)
2095 break;
2096 set_extent_dirty(copy, start, end, GFP_NOFS);
2097 last = end + 1;
2098 }
2099 mutex_unlock(&root->fs_info->pinned_mutex);
2100 return 0;
2101}
2102
2103int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2104 struct btrfs_root *root,
2105 struct extent_io_tree *unpin)
2106{
2107 u64 start;
2108 u64 end;
2109 int ret;
2110
2111 mutex_lock(&root->fs_info->pinned_mutex);
2112 while (1) {
2113 ret = find_first_extent_bit(unpin, 0, &start, &end,
2114 EXTENT_DIRTY);
2115 if (ret)
2116 break;
2117
2118 ret = btrfs_discard_extent(root, start, end + 1 - start);
2119
2120 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2121 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2122
2123 if (need_resched()) {
2124 mutex_unlock(&root->fs_info->pinned_mutex);
2125 cond_resched();
2126 mutex_lock(&root->fs_info->pinned_mutex);
2127 }
2128 }
2129 mutex_unlock(&root->fs_info->pinned_mutex);
2130 return ret;
2131}
2132
2133static int finish_current_insert(struct btrfs_trans_handle *trans,
2134 struct btrfs_root *extent_root, int all)
2135{
2136 u64 start;
2137 u64 end;
2138 u64 priv;
2139 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list;
2145 int ret;
2146 int num_inserts = 0, max_inserts;
2147
2148 path = btrfs_alloc_path();
2149 INIT_LIST_HEAD(&insert_list);
2150 INIT_LIST_HEAD(&update_list);
2151
2152 max_inserts = extent_root->leafsize /
2153 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2154 sizeof(struct btrfs_extent_ref) +
2155 sizeof(struct btrfs_extent_item));
2156again:
2157 mutex_lock(&info->extent_ins_mutex);
2158 while (1) {
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK);
2161 if (ret) {
2162 if (skipped && all && !num_inserts) {
2163 skipped = 0;
2164 search = 0;
2165 continue;
2166 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break;
2169 }
2170
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) {
2173 skipped = 1;
2174 search = end + 1;
2175 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex);
2177 cond_resched();
2178 mutex_lock(&info->extent_ins_mutex);
2179 }
2180 continue;
2181 }
2182
2183 ret = get_state_private(&info->extent_ins, start, &priv);
2184 BUG_ON(ret);
2185 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2186
2187 if (extent_op->type == PENDING_EXTENT_INSERT) {
2188 num_inserts++;
2189 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1;
2191 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex);
2193 break;
2194 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2196 list_add_tail(&extent_op->list, &update_list);
2197 search = end + 1;
2198 } else {
2199 BUG();
2200 }
2201 }
2202
2203 /*
2204 * process the update list, clear the writeback bit for it, and if
2205 * somebody marked this thing for deletion then just unlock it and be
2206 * done, the free_extents will handle it
2207 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1,
2212 EXTENT_WRITEBACK, GFP_NOFS);
2213 if (extent_op->del) {
2214 list_del_init(&extent_op->list);
2215 unlock_extent(&info->extent_ins, extent_op->bytenr,
2216 extent_op->bytenr + extent_op->num_bytes
2217 - 1, GFP_NOFS);
2218 kfree(extent_op);
2219 }
2220 }
2221 mutex_unlock(&info->extent_ins_mutex);
2222
2223 /*
2224 * still have things left on the update list, go ahead an update
2225 * everything
2226 */
2227 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret);
2230 }
2231
2232 /*
2233 * if no inserts need to be done, but we skipped some extents and we
2234 * need to make sure everything is cleaned then reset everything and
2235 * go back to the beginning
2236 */
2237 if (!num_inserts && all && skipped) {
2238 search = 0;
2239 skipped = 0;
2240 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list);
2242 goto again;
2243 } else if (!num_inserts) {
2244 goto out;
2245 }
2246
2247 /*
2248 * process the insert extents list. Again if we are deleting this
2249 * extent, then just unlock it, pin down the bytes if need be, and be
2250 * done with it. Saves us from having to actually insert the extent
2251 * into the tree and then subsequently come along and delete it
2252 */
2253 mutex_lock(&info->extent_ins_mutex);
2254 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2255 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2256 extent_op->bytenr + extent_op->num_bytes - 1,
2257 EXTENT_WRITEBACK, GFP_NOFS);
2258 if (extent_op->del) {
2259 u64 used;
2260 list_del_init(&extent_op->list);
2261 unlock_extent(&info->extent_ins, extent_op->bytenr,
2262 extent_op->bytenr + extent_op->num_bytes
2263 - 1, GFP_NOFS);
2264
2265 mutex_lock(&extent_root->fs_info->pinned_mutex);
2266 ret = pin_down_bytes(trans, extent_root,
2267 extent_op->bytenr,
2268 extent_op->num_bytes, 0);
2269 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2270
2271 spin_lock(&info->delalloc_lock);
2272 used = btrfs_super_bytes_used(&info->super_copy);
2273 btrfs_set_super_bytes_used(&info->super_copy,
2274 used - extent_op->num_bytes);
2275 used = btrfs_root_used(&extent_root->root_item);
2276 btrfs_set_root_used(&extent_root->root_item,
2277 used - extent_op->num_bytes);
2278 spin_unlock(&info->delalloc_lock);
2279
2280 ret = update_block_group(trans, extent_root,
2281 extent_op->bytenr,
2282 extent_op->num_bytes,
2283 0, ret > 0);
2284 BUG_ON(ret);
2285 kfree(extent_op);
2286 num_inserts--;
2287 }
2288 }
2289 mutex_unlock(&info->extent_ins_mutex);
2290
2291 ret = insert_extents(trans, extent_root, path, &insert_list,
2292 num_inserts);
2293 BUG_ON(ret);
2294
2295 /*
2296 * if we broke out of the loop in order to insert stuff because we hit
2297 * the maximum number of inserts at a time we can handle, then loop
2298 * back and pick up where we left off
2299 */
2300 if (num_inserts == max_inserts) {
2301 INIT_LIST_HEAD(&insert_list);
2302 INIT_LIST_HEAD(&update_list);
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */
2312 if (all && skipped) {
2313 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list);
2315 search = 0;
2316 skipped = 0;
2317 num_inserts = 0;
2318 goto again;
2319 }
2320out:
2321 btrfs_free_path(path);
2322 return 0;
2323}
2324
2325static int pin_down_bytes(struct btrfs_trans_handle *trans,
2326 struct btrfs_root *root,
2327 u64 bytenr, u64 num_bytes, int is_data)
2328{
2329 int err = 0;
2330 struct extent_buffer *buf;
2331
2332 if (is_data)
2333 goto pinit;
2334
2335 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2336 if (!buf)
2337 goto pinit;
2338
2339 /* we can reuse a block if it hasn't been written
2340 * and it is from this transaction. We can't
2341 * reuse anything from the tree log root because
2342 * it has tiny sub-transactions.
2343 */
2344 if (btrfs_buffer_uptodate(buf, 0) &&
2345 btrfs_try_tree_lock(buf)) {
2346 u64 header_owner = btrfs_header_owner(buf);
2347 u64 header_transid = btrfs_header_generation(buf);
2348 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2349 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2350 header_transid == trans->transid &&
2351 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2352 clean_tree_block(NULL, root, buf);
2353 btrfs_tree_unlock(buf);
2354 free_extent_buffer(buf);
2355 return 1;
2356 }
2357 btrfs_tree_unlock(buf);
2358 }
2359 free_extent_buffer(buf);
2360pinit:
2361 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2362
2363 BUG_ON(err < 0);
2364 return 0;
2365}
2366
2367/*
 * Drop one reference on the extent keyed (bytenr, BTRFS_EXTENT_ITEM_KEY,
 * num_bytes) in the extent tree, together with the matching back reference
 * identified by parent / root_objectid / ref_generation / owner_objectid.
 *
 * When the reference count reaches zero the extent item is deleted and:
 *   - if 'pin' is set the bytes go through pin_down_bytes(); a positive
 *     result from it forces mark_free to 1,
 *   - the bytes-used counters of the super block copy and of root->root_item
 *     are reduced under delalloc_lock,
 *   - checksums are deleted when owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
 *   - update_block_group() is called with alloc == 0 and 'mark_free'.
 *
 * finish_current_insert() is run on the extent root before returning.
 *
 * Returns -ENOMEM when no path can be allocated, otherwise the last value
 * stored in 'ret' (0 on the paths guarded by BUG_ON).
 */
2370static int __free_extent(struct btrfs_trans_handle *trans,
2371 struct btrfs_root *root,
2372 u64 bytenr, u64 num_bytes, u64 parent,
2373 u64 root_objectid, u64 ref_generation,
2374 u64 owner_objectid, int pin, int mark_free)
2375{
2376 struct btrfs_path *path;
2377 struct btrfs_key key;
2378 struct btrfs_fs_info *info = root->fs_info;
2379 struct btrfs_root *extent_root = info->extent_root;
2380 struct extent_buffer *leaf;
2381 int ret;
2382 int extent_slot = 0;
2383 int found_extent = 0;
2384 int num_to_del = 1;
2385 struct btrfs_extent_item *ei;
2386 u32 refs;
2387
2388 key.objectid = bytenr;
2389 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2390 key.offset = num_bytes;
2391 path = btrfs_alloc_path();
2392 if (!path)
2393 return -ENOMEM;
2394
2395 path->reada = 1;
2396 ret = lookup_extent_backref(trans, extent_root, path,
2397 bytenr, parent, root_objectid,
2398 ref_generation, owner_objectid, 1);
2399 if (ret == 0) {
2400 struct btrfs_key found_key;
2401 extent_slot = path->slots[0];
/*
 * Walk backwards from the back reference looking for the extent item in
 * the same leaf.  Stop at slot 0, at an item for a different bytenr, or
 * once more than 5 slots away from the back reference.
 */
2402 while (extent_slot > 0) {
2403 extent_slot--;
2404 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2405 extent_slot);
2406 if (found_key.objectid != bytenr)
2407 break;
2408 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2409 found_key.offset == num_bytes) {
2410 found_extent = 1;
2411 break;
2412 }
2413 if (path->slots[0] - extent_slot > 5)
2414 break;
2415 }
/*
 * Extent item not found near the back reference: delete the back reference
 * now and search for the extent item by key.
 */
2416 if (!found_extent) {
2417 ret = remove_extent_backref(trans, extent_root, path);
2418 BUG_ON(ret);
2419 btrfs_release_path(extent_root, path);
2420 ret = btrfs_search_slot(trans, extent_root,
2421 &key, path, -1, 1);
2422 if (ret) {
2423 printk(KERN_ERR "umm, got %d back from search"
2424 ", was looking for %llu\n", ret,
2425 (unsigned long long)bytenr);
2426 btrfs_print_leaf(extent_root, path->nodes[0]);
2427 }
2428 BUG_ON(ret);
2429 extent_slot = path->slots[0];
2430 }
2431 } else {
/*
 * NOTE(review): the back reference lookup failed, yet after the diagnostics
 * below execution continues with extent_slot == 0 and found_extent == 0,
 * i.e. slot 0 of whatever leaf the path holds is treated as the extent
 * item.  Confirm whether this path should bail out instead.
 */
2432 btrfs_print_leaf(extent_root, path->nodes[0]);
2433 WARN_ON(1);
2434 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2435 "root %llu gen %llu owner %llu\n",
2436 (unsigned long long)bytenr,
2437 (unsigned long long)root_objectid,
2438 (unsigned long long)ref_generation,
2439 (unsigned long long)owner_objectid);
2440 }
2441
/* drop one reference on the extent item and dirty the leaf */
2442 leaf = path->nodes[0];
2443 ei = btrfs_item_ptr(leaf, extent_slot,
2444 struct btrfs_extent_item);
2445 refs = btrfs_extent_refs(leaf, ei);
2446 BUG_ON(refs == 0);
2447 refs -= 1;
2448 btrfs_set_extent_refs(leaf, ei, refs);
2449
2450 btrfs_mark_buffer_dirty(leaf);
2451
2452 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2453 struct btrfs_extent_ref *ref;
2454 ref = btrfs_item_ptr(leaf, path->slots[0],
2455 struct btrfs_extent_ref);
2456 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2457 /* if the back ref and the extent are next to each other
2458 * they get deleted below in one shot
2459 */
2460 path->slots[0] = extent_slot;
2461 num_to_del = 2;
2462 } else if (found_extent) {
2463 /* otherwise delete the extent back ref */
2464 ret = remove_extent_backref(trans, extent_root, path);
2465 BUG_ON(ret);
2466 /* if refs are 0, we need to setup the path for deletion */
2467 if (refs == 0) {
2468 btrfs_release_path(extent_root, path);
2469 ret = btrfs_search_slot(trans, extent_root, &key, path,
2470 -1, 1);
2471 BUG_ON(ret);
2472 }
2473 }
2474
/* last reference gone: release the space and delete the item(s) */
2475 if (refs == 0) {
2476 u64 super_used;
2477 u64 root_used;
2478
2479 if (pin) {
2480 mutex_lock(&root->fs_info->pinned_mutex);
2481 ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2482 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2483 mutex_unlock(&root->fs_info->pinned_mutex);
/* positive result: the block was not pinned, so mark it free below */
2484 if (ret > 0)
2485 mark_free = 1;
2486 BUG_ON(ret < 0);
2487 }
2488 /* block accounting for super block */
2489 spin_lock(&info->delalloc_lock);
2490 super_used = btrfs_super_bytes_used(&info->super_copy);
2491 btrfs_set_super_bytes_used(&info->super_copy,
2492 super_used - num_bytes);
2493
2494 /* block accounting for root item */
2495 root_used = btrfs_root_used(&root->root_item);
2496 btrfs_set_root_used(&root->root_item,
2497 root_used - num_bytes);
2498 spin_unlock(&info->delalloc_lock);
2499 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2500 num_to_del);
2501 BUG_ON(ret);
2502 btrfs_release_path(extent_root, path);
2503
2504 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2505 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2506 BUG_ON(ret);
2507 }
2508
2509 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2510 mark_free);
2511 BUG_ON(ret);
2512 }
2513 btrfs_free_path(path);
/* the return value of finish_current_insert() is not checked here */
2514 finish_current_insert(trans, extent_root, 0);
2515 return ret;
2516}
2517
2518/*
2519 * find all the blocks marked as pending in the radix tree and remove
2520 * them from the extent map
2521 */
2522static int del_pending_extents(struct btrfs_trans_handle *trans,
2523 struct btrfs_root *extent_root, int all)
2524{
2525 int ret;
2526 int err = 0;
2527 u64 start;
2528 u64 end;
2529 u64 priv;
2530 u64 search = 0;
2531 int nr = 0, skipped = 0;
2532 struct extent_io_tree *pending_del;
2533 struct extent_io_tree *extent_ins;
2534 struct pending_extent_op *extent_op;
2535 struct btrfs_fs_info *info = extent_root->fs_info;
2536 struct list_head delete_list;
2537
2538 INIT_LIST_HEAD(&delete_list);
2539 extent_ins = &extent_root->fs_info->extent_ins;
2540 pending_del = &extent_root->fs_info->pending_del;
2541
2542again:
2543 mutex_lock(&info->extent_ins_mutex);
2544 while (1) {
2545 ret = find_first_extent_bit(pending_del, search, &start, &end,
2546 EXTENT_WRITEBACK);
2547 if (ret) {
2548 if (all && skipped && !nr) {
2549 search = 0;
2550 continue;
2551 }
2552 mutex_unlock(&info->extent_ins_mutex);
2553 break;
2554 }
2555
2556 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2557 if (!ret) {
2558 search = end+1;
2559 skipped = 1;
2560
2561 if (need_resched()) {
2562 mutex_unlock(&info->extent_ins_mutex);
2563 cond_resched();
2564 mutex_lock(&info->extent_ins_mutex);
2565 }
2566
2567 continue;
2568 }
2569 BUG_ON(ret < 0);
2570
2571 ret = get_state_private(pending_del, start, &priv);
2572 BUG_ON(ret);
2573 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2574
2575 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2576 GFP_NOFS);
2577 if (!test_range_bit(extent_ins, start, end,
2578 EXTENT_WRITEBACK, 0)) {
2579 list_add_tail(&extent_op->list, &delete_list);
2580 nr++;
2581 } else {
2582 kfree(extent_op);
2583
2584 ret = get_state_private(&info->extent_ins, start,
2585 &priv);
2586 BUG_ON(ret);
2587 extent_op = (struct pending_extent_op *)
2588 (unsigned long)priv;
2589
2590 clear_extent_bits(&info->extent_ins, start, end,
2591 EXTENT_WRITEBACK, GFP_NOFS);
2592
2593 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2594 list_add_tail(&extent_op->list, &delete_list);
2595 search = end + 1;
2596 nr++;
2597 continue;
2598 }
2599
2600 mutex_lock(&extent_root->fs_info->pinned_mutex);
2601 ret = pin_down_bytes(trans, extent_root, start,
2602 end + 1 - start, 0);
2603 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2604
2605 ret = update_block_group(trans, extent_root, start,
2606 end + 1 - start, 0, ret > 0);
2607
2608 unlock_extent(extent_ins, start, end, GFP_NOFS);
2609 BUG_ON(ret);
2610 kfree(extent_op);
2611 }
2612 if (ret)
2613 err = ret;
2614
2615 search = end + 1;
2616
2617 if (need_resched()) {
2618 mutex_unlock(&info->extent_ins_mutex);
2619 cond_resched();
2620 mutex_lock(&info->extent_ins_mutex);
2621 }
2622 }
2623
2624 if (nr) {
2625 ret = free_extents(trans, extent_root, &delete_list);
2626 BUG_ON(ret);
2627 }
2628
2629 if (all && skipped) {
2630 INIT_LIST_HEAD(&delete_list);
2631 search = 0;
2632 nr = 0;
2633 goto again;
2634 }
2635
2636 return err;
2637}
2638
2639/*
2640 * remove an extent from the root, returns 0 on success
2641 */
2642static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2643 struct btrfs_root *root,
2644 u64 bytenr, u64 num_bytes, u64 parent,
2645 u64 root_objectid, u64 ref_generation,
2646 u64 owner_objectid, int pin)
2647{
2648 struct btrfs_root *extent_root = root->fs_info->extent_root;
2649 int pending_ret;
2650 int ret;
2651
2652 WARN_ON(num_bytes < root->sectorsize);
2653 if (root == extent_root) {
2654 struct pending_extent_op *extent_op = NULL;
2655
2656 mutex_lock(&root->fs_info->extent_ins_mutex);
2657 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2658 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2659 u64 priv;
2660 ret = get_state_private(&root->fs_info->extent_ins,
2661 bytenr, &priv);
2662 BUG_ON(ret);
2663 extent_op = (struct pending_extent_op *)
2664 (unsigned long)priv;
2665
2666 extent_op->del = 1;
2667 if (extent_op->type == PENDING_EXTENT_INSERT) {
2668 mutex_unlock(&root->fs_info->extent_ins_mutex);
2669 return 0;
2670 }
2671 }
2672
2673 if (extent_op) {
2674 ref_generation = extent_op->orig_generation;
2675 parent = extent_op->orig_parent;
2676 }
2677
2678 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2679 BUG_ON(!extent_op);
2680
2681 extent_op->type = PENDING_EXTENT_DELETE;
2682 extent_op->bytenr = bytenr;
2683 extent_op->num_bytes = num_bytes;
2684 extent_op->parent = parent;
2685 extent_op->orig_parent = parent;
2686 extent_op->generation = ref_generation;
2687 extent_op->orig_generation = ref_generation;
2688 extent_op->level = (int)owner_objectid;
2689 INIT_LIST_HEAD(&extent_op->list);
2690 extent_op->del = 0;
2691
2692 set_extent_bits(&root->fs_info->pending_del,
2693 bytenr, bytenr + num_bytes - 1,
2694 EXTENT_WRITEBACK, GFP_NOFS);
2695 set_state_private(&root->fs_info->pending_del,
2696 bytenr, (unsigned long)extent_op);
2697 mutex_unlock(&root->fs_info->extent_ins_mutex);
2698 return 0;
2699 }
2700 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache;
2704
2705 /* btrfs_free_reserved_extent */
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0;
2712 }
2713 pin = 1;
2714 }
2715
2716 /* if data pin when any transaction has committed this */
2717 if (ref_generation != trans->transid)
2718 pin = 1;
2719
2720 ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2721 root_objectid, ref_generation,
2722 owner_objectid, pin, pin == 0);
2723
2724 finish_current_insert(trans, root->fs_info->extent_root, 0);
2725 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
2726 return ret ? ret : pending_ret;
2727}
2728
2729int btrfs_free_extent(struct btrfs_trans_handle *trans,
2730 struct btrfs_root *root,
2731 u64 bytenr, u64 num_bytes, u64 parent,
2732 u64 root_objectid, u64 ref_generation,
2733 u64 owner_objectid, int pin)
2734{
2735 int ret;
2736
2737 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2738 root_objectid, ref_generation,
2739 owner_objectid, pin);
2740 return ret;
2741}
2742
2743static u64 stripe_align(struct btrfs_root *root, u64 val)
2744{
2745 u64 mask = ((u64)root->stripesize - 1);
2746 u64 ret = (val + mask) & ~mask;
2747 return ret;
2748}
2749
2750/*
 * Search the free space of the block groups for a hole of num_bytes and
 * remove it from the free space cache.  The key 'ins' records the result:
 * ins->objectid == start byte of the hole (0 when nothing was found)
 * ins->type == BTRFS_EXTENT_ITEM_KEY
 * ins->offset == num_bytes
 *
 * empty_size is extra room requested after the allocation; it is added to
 * the amount searched for and given up once every block group has been
 * tried.  Space before search_start is skipped and a hole must end before
 * search_end.  hint_byte suggests where to start.  A candidate overlapping
 * [exclude_start, exclude_start + exclude_nr) is rejected.  'data' carries
 * the block group type bits the chosen group has to match.
 *
 * Returns 0 with 'ins' filled in, -ENOSPC when no block group had room, or
 * the error left in 'ret' by cache_block_group() / do_chunk_alloc().
 */
2758static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2759 struct btrfs_root *orig_root,
2760 u64 num_bytes, u64 empty_size,
2761 u64 search_start, u64 search_end,
2762 u64 hint_byte, struct btrfs_key *ins,
2763 u64 exclude_start, u64 exclude_nr,
2764 int data)
2765{
2766 int ret = 0;
2767 struct btrfs_root *root = orig_root->fs_info->extent_root;
2768 u64 total_needed = num_bytes;
2769 u64 *last_ptr = NULL;
2770 u64 last_wanted = 0;
2771 struct btrfs_block_group_cache *block_group = NULL;
2772 int chunk_alloc_done = 0;
2773 int empty_cluster = 2 * 1024 * 1024;
2774 int allowed_chunk_alloc = 0;
2775 struct list_head *head = NULL, *cur = NULL;
2776 int loop = 0;
2777 int extra_loop = 0;
2778 struct btrfs_space_info *space_info;
2779
2780 WARN_ON(num_bytes < root->sectorsize);
2781 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2782 ins->objectid = 0;
2783 ins->offset = 0;
2784
2785 if (orig_root->ref_cows || empty_size)
2786 allowed_chunk_alloc = 1;
2787
/* metadata continues from fs_info->last_alloc and uses a 64K cluster */
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024;
2791 }
2792
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2794 last_ptr = &root->fs_info->last_data_alloc;
2795
/*
 * A remembered last allocation becomes the hint; without one the request is
 * padded by the cluster size.  With no last_ptr at all clustering is off.
 */
2796 if (last_ptr) {
2797 if (*last_ptr) {
2798 hint_byte = *last_ptr;
2799 last_wanted = *last_ptr;
2800 } else
2801 empty_size += empty_cluster;
2802 } else {
2803 empty_cluster = 0;
2804 }
2805 search_start = max(search_start, first_logical_byte(root, 0));
2806 search_start = max(search_start, hint_byte);
2807
2808 if (last_wanted && search_start != last_wanted) {
2809 last_wanted = 0;
2810 empty_size += empty_cluster;
2811 }
2812
2813 total_needed += empty_size;
2814 block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2815 if (!block_group)
2816 block_group = btrfs_lookup_first_block_group(root->fs_info,
2817 search_start);
2818 space_info = __find_space_info(root->fs_info, data);
2819
2820 down_read(&space_info->groups_sem);
2821 while (1) {
2822 struct btrfs_free_space *free_space;
2823 /*
2824 * the only way this happens if our hint points to a block
2825 * group that's not of the proper type, while looping this
2826 * should never happen
2827 */
2828 if (empty_size)
2829 extra_loop = 1;
2830
2831 if (!block_group)
2832 goto new_group_no_lock;
2833
/* block group not cached yet: cache it under its cache_mutex */
2834 if (unlikely(!block_group->cached)) {
2835 mutex_lock(&block_group->cache_mutex);
2836 ret = cache_block_group(root, block_group);
2837 mutex_unlock(&block_group->cache_mutex);
2838 if (ret)
2839 break;
2840 }
2841
2842 mutex_lock(&block_group->alloc_mutex);
2843 if (unlikely(!block_group_bits(block_group, data)))
2844 goto new_group;
2845
2846 if (unlikely(block_group->ro))
2847 goto new_group;
2848
2849 free_space = btrfs_find_free_space(block_group, search_start,
2850 total_needed);
2851 if (free_space) {
2852 u64 start = block_group->key.objectid;
2853 u64 end = block_group->key.objectid +
2854 block_group->key.offset;
2855
2856 search_start = stripe_align(root, free_space->offset);
2857
2858 /* move on to the next group */
2859 if (search_start + num_bytes >= search_end)
2860 goto new_group;
2861
2862 /* move on to the next group */
2863 if (search_start + num_bytes > end)
2864 goto new_group;
2865
/*
 * the hole is not where the previous allocation ended: stop insisting on
 * that spot and ask for a whole extra cluster instead
 */
2866 if (last_wanted && search_start != last_wanted) {
2867 total_needed += empty_cluster;
2868 empty_size += empty_cluster;
2869 last_wanted = 0;
2870 /*
2871 * if search_start is still in this block group
2872 * then we just re-search this block group
2873 */
2874 if (search_start >= start &&
2875 search_start < end) {
2876 mutex_unlock(&block_group->alloc_mutex);
2877 continue;
2878 }
2879
2880 /* else we go to the next block group */
2881 goto new_group;
2882 }
2883
/* candidate overlaps the excluded range: restart just past it */
2884 if (exclude_nr > 0 &&
2885 (search_start + num_bytes > exclude_start &&
2886 search_start < exclude_start + exclude_nr)) {
2887 search_start = exclude_start + exclude_nr;
2888 /*
2889 * if search_start is still in this block group
2890 * then we just re-search this block group
2891 */
2892 if (search_start >= start &&
2893 search_start < end) {
2894 mutex_unlock(&block_group->alloc_mutex);
2895 last_wanted = 0;
2896 continue;
2897 }
2898
2899 /* else we go to the next block group */
2900 goto new_group;
2901 }
2902
2903 ins->objectid = search_start;
2904 ins->offset = num_bytes;
2905
2906 btrfs_remove_free_space_lock(block_group, search_start,
2907 num_bytes);
2908 /* we are all good, lets return */
2909 mutex_unlock(&block_group->alloc_mutex);
2910 break;
2911 }
2912new_group:
2913 mutex_unlock(&block_group->alloc_mutex);
2914 put_block_group(block_group);
2915 block_group = NULL;
2916new_group_no_lock:
2917 /* don't try to compare new allocations against the
2918 * last allocation any more
2919 */
2920 last_wanted = 0;
2921
2922 /*
2923 * Here's how this works.
2924 * loop == 0: we were searching a block group via a hint
2925 * and didn't find anything, so we start at
2926 * the head of the block groups and keep searching
2927 * loop == 1: we're searching through all of the block groups
2928 * if we hit the head again we have searched
2929 * all of the block groups for this space and we
2930 * need to try and allocate, if we can't error out.
2931 * loop == 2: we allocated more space and are looping through
2932 * all of the block groups again.
2933 */
2934 if (loop == 0) {
2935 head = &space_info->block_groups;
2936 cur = head->next;
2937 loop++;
2938 } else if (loop == 1 && cur == head) {
2939 int keep_going;
2940
2941 /* at this point we give up on the empty_size
2942 * allocations and just try to allocate the min
2943 * space.
2944 *
2945 * The extra_loop field was set if an empty_size
2946 * allocation was attempted above, and if this
2947 * is true we need to try the loop again without
2948 * the additional empty_size.
2949 */
2950 total_needed -= empty_size;
2951 empty_size = 0;
2952 keep_going = extra_loop;
2953 loop++;
2954
/* groups_sem is dropped around the chunk allocation and retaken after */
2955 if (allowed_chunk_alloc && !chunk_alloc_done) {
2956 up_read(&space_info->groups_sem);
2957 ret = do_chunk_alloc(trans, root, num_bytes +
2958 2 * 1024 * 1024, data, 1);
2959 down_read(&space_info->groups_sem);
2960 if (ret < 0)
2961 goto loop_check;
2962 head = &space_info->block_groups;
2963 /*
2964 * we've allocated a new chunk, keep
2965 * trying
2966 */
2967 keep_going = 1;
2968 chunk_alloc_done = 1;
2969 } else if (!allowed_chunk_alloc) {
2970 space_info->force_alloc = 1;
2971 }
2972loop_check:
2973 if (keep_going) {
2974 cur = head->next;
2975 extra_loop = 0;
2976 } else {
2977 break;
2978 }
2979 } else if (cur == head) {
2980 break;
2981 }
2982
/* take a reference on the next block group in the list and try it */
2983 block_group = list_entry(cur, struct btrfs_block_group_cache,
2984 list);
2985 atomic_inc(&block_group->count);
2986
2987 search_start = block_group->key.objectid;
2988 cur = cur->next;
2989 }
2990
2991 /* we found what we needed */
2992 if (ins->objectid) {
2993 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2994 trans->block_group = block_group->key.objectid;
2995
/* remember where this allocation ended for the next clustered search */
2996 if (last_ptr)
2997 *last_ptr = ins->objectid + ins->offset;
2998 ret = 0;
2999 } else if (!ret) {
3000 printk(KERN_ERR "btrfs searching for %llu bytes, "
3001 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3002 (unsigned long long)total_needed,
3003 (unsigned long long)num_bytes,
3004 loop, allowed_chunk_alloc);
3005 ret = -ENOSPC;
3006 }
3007 if (block_group)
3008 put_block_group(block_group);
3009
3010 up_read(&space_info->groups_sem);
3011 return ret;
3012}
3013
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{
3016 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not ");
3023
3024 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n",
3030 (unsigned long long)cache->key.objectid,
3031 (unsigned long long)cache->key.offset,
3032 (unsigned long long)btrfs_block_group_used(&cache->item),
3033 (unsigned long long)cache->pinned,
3034 (unsigned long long)cache->reserved);
3035 btrfs_dump_free_space(cache, bytes);
3036 spin_unlock(&cache->lock);
3037 }
3038 up_read(&info->groups_sem);
3039}
3040
3041static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3042 struct btrfs_root *root,
3043 u64 num_bytes, u64 min_alloc_size,
3044 u64 empty_size, u64 hint_byte,
3045 u64 search_end, struct btrfs_key *ins,
3046 u64 data)
3047{
3048 int ret;
3049 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info;
3052
3053 if (data) {
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations
3071 */
3072 if (empty_size || root->ref_cows) {
3073 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3074 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3075 2 * 1024 * 1024,
3076 BTRFS_BLOCK_GROUP_METADATA |
3077 (info->metadata_alloc_profile &
3078 info->avail_metadata_alloc_bits), 0);
3079 }
3080 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3081 num_bytes + 2 * 1024 * 1024, data, 0);
3082 }
3083
3084 WARN_ON(num_bytes < root->sectorsize);
3085 ret = find_free_extent(trans, root, num_bytes, empty_size,
3086 search_start, search_end, hint_byte, ins,
3087 trans->alloc_exclude_start,
3088 trans->alloc_exclude_nr, data);
3089
3090 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3091 num_bytes = num_bytes >> 1;
3092 num_bytes = num_bytes & ~(root->sectorsize - 1);
3093 num_bytes = max(num_bytes, min_alloc_size);
3094 do_chunk_alloc(trans, root->fs_info->extent_root,
3095 num_bytes, data, 1);
3096 goto again;
3097 }
3098 if (ret) {
3099 struct btrfs_space_info *sinfo;
3100
3101 sinfo = __find_space_info(root->fs_info, data);
3102 printk(KERN_ERR "btrfs allocation failed flags %llu, "
3103 "wanted %llu\n", (unsigned long long)data,
3104 (unsigned long long)num_bytes);
3105 dump_space_info(sinfo, num_bytes);
3106 BUG();
3107 }
3108
3109 return ret;
3110}
3111
3112int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3113{
3114 struct btrfs_block_group_cache *cache;
3115 int ret = 0;
3116
3117 cache = btrfs_lookup_block_group(root->fs_info, start);
3118 if (!cache) {
3119 printk(KERN_ERR "Unable to find block group for %llu\n",
3120 (unsigned long long)start);
3121 return -ENOSPC;
3122 }
3123
3124 ret = btrfs_discard_extent(root, start, len);
3125
3126 btrfs_add_free_space(cache, start, len);
3127 put_block_group(cache);
3128 update_reserved_extents(root, start, len, 0);
3129
3130 return ret;
3131}
3132
3133int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3134 struct btrfs_root *root,
3135 u64 num_bytes, u64 min_alloc_size,
3136 u64 empty_size, u64 hint_byte,
3137 u64 search_end, struct btrfs_key *ins,
3138 u64 data)
3139{
3140 int ret;
3141 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3142 empty_size, hint_byte, search_end, ins,
3143 data);
3144 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3145 return ret;
3146}
3147
3148static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3149 struct btrfs_root *root, u64 parent,
3150 u64 root_objectid, u64 ref_generation,
3151 u64 owner, struct btrfs_key *ins)
3152{
3153 int ret;
3154 int pending_ret;
3155 u64 super_used;
3156 u64 root_used;
3157 u64 num_bytes = ins->offset;
3158 u32 sizes[2];
3159 struct btrfs_fs_info *info = root->fs_info;
3160 struct btrfs_root *extent_root = info->extent_root;
3161 struct btrfs_extent_item *extent_item;
3162 struct btrfs_extent_ref *ref;
3163 struct btrfs_path *path;
3164 struct btrfs_key keys[2];
3165
3166 if (parent == 0)
3167 parent = ins->objectid;
3168
3169 /* block accounting for super block */
3170 spin_lock(&info->delalloc_lock);
3171 super_used = btrfs_super_bytes_used(&info->super_copy);
3172 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3173
3174 /* block accounting for root item */
3175 root_used = btrfs_root_used(&root->root_item);
3176 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3177 spin_unlock(&info->delalloc_lock);
3178
3179 if (root == extent_root) {
3180 struct pending_extent_op *extent_op;
3181
3182 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3183 BUG_ON(!extent_op);
3184
3185 extent_op->type = PENDING_EXTENT_INSERT;
3186 extent_op->bytenr = ins->objectid;
3187 extent_op->num_bytes = ins->offset;
3188 extent_op->parent = parent;
3189 extent_op->orig_parent = 0;
3190 extent_op->generation = ref_generation;
3191 extent_op->orig_generation = 0;
3192 extent_op->level = (int)owner;
3193 INIT_LIST_HEAD(&extent_op->list);
3194 extent_op->del = 0;
3195
3196 mutex_lock(&root->fs_info->extent_ins_mutex);
3197 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3198 ins->objectid + ins->offset - 1,
3199 EXTENT_WRITEBACK, GFP_NOFS);
3200 set_state_private(&root->fs_info->extent_ins,
3201 ins->objectid, (unsigned long)extent_op);
3202 mutex_unlock(&root->fs_info->extent_ins_mutex);
3203 goto update_block;
3204 }
3205
3206 memcpy(&keys[0], ins, sizeof(*ins));
3207 keys[1].objectid = ins->objectid;
3208 keys[1].type = BTRFS_EXTENT_REF_KEY;
3209 keys[1].offset = parent;
3210 sizes[0] = sizeof(*extent_item);
3211 sizes[1] = sizeof(*ref);
3212
3213 path = btrfs_alloc_path();
3214 BUG_ON(!path);
3215
3216 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3217 sizes, 2);
3218 BUG_ON(ret);
3219
3220 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3221 struct btrfs_extent_item);
3222 btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3223 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3224 struct btrfs_extent_ref);
3225
3226 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3227 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3228 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3229 btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3230
3231 btrfs_mark_buffer_dirty(path->nodes[0]);
3232
3233 trans->alloc_exclude_start = 0;
3234 trans->alloc_exclude_nr = 0;
3235 btrfs_free_path(path);
3236 finish_current_insert(trans, extent_root, 0);
3237 pending_ret = del_pending_extents(trans, extent_root, 0);
3238
3239 if (ret)
3240 goto out;
3241 if (pending_ret) {
3242 ret = pending_ret;
3243 goto out;
3244 }
3245
3246update_block:
3247 ret = update_block_group(trans, root, ins->objectid,
3248 ins->offset, 1, 0);
3249 if (ret) {
3250 printk(KERN_ERR "btrfs update block group failed for %llu "
3251 "%llu\n", (unsigned long long)ins->objectid,
3252 (unsigned long long)ins->offset);
3253 BUG();
3254 }
3255out:
3256 return ret;
3257}
3258
3259int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3260 struct btrfs_root *root, u64 parent,
3261 u64 root_objectid, u64 ref_generation,
3262 u64 owner, struct btrfs_key *ins)
3263{
3264 int ret;
3265
3266 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3267 return 0;
3268 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3269 ref_generation, owner, ins);
3270 update_reserved_extents(root, ins->objectid, ins->offset, 0);
3271 return ret;
3272}
3273
3274/*
3275 * this is used by the tree logging recovery code. It records that
3276 * an extent has been allocated and makes sure to clear the free
3277 * space cache bits as well
3278 */
3279int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3280 struct btrfs_root *root, u64 parent,
3281 u64 root_objectid, u64 ref_generation,
3282 u64 owner, struct btrfs_key *ins)
3283{
3284 int ret;
3285 struct btrfs_block_group_cache *block_group;
3286
3287 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3288 mutex_lock(&block_group->cache_mutex);
3289 cache_block_group(root, block_group);
3290 mutex_unlock(&block_group->cache_mutex);
3291
3292 ret = btrfs_remove_free_space(block_group, ins->objectid,
3293 ins->offset);
3294 BUG_ON(ret);
3295 put_block_group(block_group);
3296 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3297 ref_generation, owner, ins);
3298 return ret;
3299}
3300
3301/*
3302 * finds a free extent and does all the dirty work required for allocation
3303 * returns the key for the extent through ins, and a tree buffer for
3304 * the first block of the extent through buf.
3305 *
3306 * returns 0 if everything worked, non-zero otherwise.
3307 */
3308int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3309 struct btrfs_root *root,
3310 u64 num_bytes, u64 parent, u64 min_alloc_size,
3311 u64 root_objectid, u64 ref_generation,
3312 u64 owner_objectid, u64 empty_size, u64 hint_byte,
3313 u64 search_end, struct btrfs_key *ins, u64 data)
3314{
3315 int ret;
3316
3317 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3318 min_alloc_size, empty_size, hint_byte,
3319 search_end, ins, data);
3320 BUG_ON(ret);
3321 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3322 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3323 root_objectid, ref_generation,
3324 owner_objectid, ins);
3325 BUG_ON(ret);
3326
3327 } else {
3328 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3329 }
3330 return ret;
3331}
3332
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize)
3336{
3337 struct extent_buffer *buf;
3338
3339 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3340 if (!buf)
3341 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid);
3343 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf);
3345 btrfs_set_buffer_uptodate(buf);
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS);
3349 } else {
3350 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3351 buf->start + buf->len - 1, GFP_NOFS);
3352 }
3353 trans->blocks_used++;
3354 return buf;
3355}
3356
3357/*
3358 * helper function to allocate a block for a given tree
3359 * returns the tree buffer or NULL.
3360 */
3361struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3362 struct btrfs_root *root,
3363 u32 blocksize, u64 parent,
3364 u64 root_objectid,
3365 u64 ref_generation,
3366 int level,
3367 u64 hint,
3368 u64 empty_size)
3369{
3370 struct btrfs_key ins;
3371 int ret;
3372 struct extent_buffer *buf;
3373
3374 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3375 root_objectid, ref_generation, level,
3376 empty_size, hint, (u64)-1, &ins, 0);
3377 if (ret) {
3378 BUG_ON(ret > 0);
3379 return ERR_PTR(ret);
3380 }
3381
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
3383 return buf;
3384}
3385
3386int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3387 struct btrfs_root *root, struct extent_buffer *leaf)
3388{
3389 u64 leaf_owner;
3390 u64 leaf_generation;
3391 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi;
3393 int i;
3394 int nritems;
3395 int ret;
3396
3397 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf);
3401
3402 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr;
3404 cond_resched();
3405
3406 btrfs_item_key_to_cpu(leaf, &key, i);
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue;
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3410 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE)
3412 continue;
3413 /*
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3418 if (disk_bytenr == 0)
3419 continue;
3420
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation,
3424 key.objectid, 0);
3425 BUG_ON(ret);
3426
3427 atomic_inc(&root->fs_info->throttle_gen);
3428 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched();
3430 }
3431 return 0;
3432}
3433
3434static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3435 struct btrfs_root *root,
3436 struct btrfs_leaf_ref *ref)
3437{
3438 int i;
3439 int ret;
3440 struct btrfs_extent_info *info = ref->extents;
3441
3442 for (i = 0; i < ref->nritems; i++) {
3443 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation,
3446 info->objectid, 0);
3447
3448 atomic_inc(&root->fs_info->throttle_gen);
3449 wake_up(&root->fs_info->transaction_throttle);
3450 cond_resched();
3451
3452 BUG_ON(ret);
3453 info++;
3454 }
3455
3456 return 0;
3457}
3458
3459static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3460 u64 len, u32 *refs)
3461{
3462 int ret;
3463
3464 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3465 BUG_ON(ret);
3466
3467#if 0 /* some debugging code in case we see problems here */
3468 /* if the refs count is one, it won't get increased again. But
3469 * if the ref count is > 1, someone may be decreasing it at
3470 * the same time we are.
3471 */
3472 if (*refs != 1) {
3473 struct extent_buffer *eb = NULL;
3474 eb = btrfs_find_create_tree_block(root, start, len);
3475 if (eb)
3476 btrfs_tree_lock(eb);
3477
3478 mutex_lock(&root->fs_info->alloc_mutex);
3479 ret = lookup_extent_ref(NULL, root, start, len, refs);
3480 BUG_ON(ret);
3481 mutex_unlock(&root->fs_info->alloc_mutex);
3482
3483 if (eb) {
3484 btrfs_tree_unlock(eb);
3485 free_extent_buffer(eb);
3486 }
3487 if (*refs == 1) {
3488 printk(KERN_ERR "btrfs block %llu went down to one "
3489 "during drop_snap\n", (unsigned long long)start);
3490 }
3491
3492 }
3493#endif
3494
3495 cond_resched();
3496 return ret;
3497}
3498
3499/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes.
3502 */
3503static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3504 struct btrfs_root *root,
3505 struct btrfs_path *path, int *level)
3506{
3507 u64 root_owner;
3508 u64 root_gen;
3509 u64 bytenr;
3510 u64 ptr_gen;
3511 struct extent_buffer *next;
3512 struct extent_buffer *cur;
3513 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize;
3516 int ret;
3517 u32 refs;
3518
3519 WARN_ON(*level < 0);
3520 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3521 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
3522 path->nodes[*level]->len, &refs);
3523 BUG_ON(ret);
3524 if (refs > 1)
3525 goto out;
3526
3527 /*
3528 * walk down to the last node level and free all the leaves
3529 */
3530 while (*level >= 0) {
3531 WARN_ON(*level < 0);
3532 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3533 cur = path->nodes[*level];
3534
3535 if (btrfs_header_level(cur) != *level)
3536 WARN_ON(1);
3537
3538 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur))
3540 break;
3541 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret);
3544 break;
3545 }
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1);
3549
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret);
3552 if (refs != 1) {
3553 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent);
3555 root_gen = btrfs_header_generation(parent);
3556 path->slots[*level]++;
3557
3558 ret = __btrfs_free_extent(trans, root, bytenr,
3559 blocksize, parent->start,
3560 root_owner, root_gen,
3561 *level - 1, 1);
3562 BUG_ON(ret);
3563
3564 atomic_inc(&root->fs_info->throttle_gen);
3565 wake_up(&root->fs_info->transaction_throttle);
3566 cond_resched();
3567
3568 continue;
3569 }
3570 /*
3571 * at this point, we have a single ref, and since the
3572 * only place referencing this extent is a dead root
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */
3576 if (*level == 1) {
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]);
3613 path->nodes[*level-1] = next;
3614 *level = btrfs_header_level(next);
3615 path->slots[*level] = 0;
3616 cond_resched();
3617 }
3618out:
3619 WARN_ON(*level < 0);
3620 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3621
3622 if (path->nodes[*level] == root->node) {
3623 parent = path->nodes[*level];
3624 bytenr = path->nodes[*level]->start;
3625 } else {
3626 parent = path->nodes[*level + 1];
3627 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3628 }
3629
3630 blocksize = btrfs_level_size(root, *level);
3631 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent);
3633
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen,
3636 *level, 1);
3637 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL;
3639 *level += 1;
3640 BUG_ON(ret);
3641
3642 cond_resched();
3643 return 0;
3644}
3645
3646/*
3647 * helper function for drop_subtree, this function is similar to
3648 * walk_down_tree. The main difference is that it checks reference
3649 * counts while tree blocks are locked.
3650 */
3651static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3652 struct btrfs_root *root,
3653 struct btrfs_path *path, int *level)
3654{
3655 struct extent_buffer *next;
3656 struct extent_buffer *cur;
3657 struct extent_buffer *parent;
3658 u64 bytenr;
3659 u64 ptr_gen;
3660 u32 blocksize;
3661 u32 refs;
3662 int ret;
3663
3664 cur = path->nodes[*level];
3665 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3666 &refs);
3667 BUG_ON(ret);
3668 if (refs > 1)
3669 goto out;
3670
3671 while (*level >= 0) {
3672 cur = path->nodes[*level];
3673 if (*level == 0) {
3674 ret = btrfs_drop_leaf_ref(trans, root, cur);
3675 BUG_ON(ret);
3676 clean_tree_block(trans, root, cur);
3677 break;
3678 }
3679 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3680 clean_tree_block(trans, root, cur);
3681 break;
3682 }
3683
3684 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3685 blocksize = btrfs_level_size(root, *level - 1);
3686 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3687
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next);
3690
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs);
3693 BUG_ON(ret);
3694 if (refs > 1) {
3695 parent = path->nodes[*level];
3696 ret = btrfs_free_extent(trans, root, bytenr,
3697 blocksize, parent->start,
3698 btrfs_header_owner(parent),
3699 btrfs_header_generation(parent),
3700 *level - 1, 1);
3701 BUG_ON(ret);
3702 path->slots[*level]++;
3703 btrfs_tree_unlock(next);
3704 free_extent_buffer(next);
3705 continue;
3706 }
3707
3708 *level = btrfs_header_level(next);
3709 path->nodes[*level] = next;
3710 path->slots[*level] = 0;
3711 path->locks[*level] = 1;
3712 cond_resched();
3713 }
3714out:
3715 parent = path->nodes[*level + 1];
3716 bytenr = path->nodes[*level]->start;
3717 blocksize = path->nodes[*level]->len;
3718
3719 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3720 parent->start, btrfs_header_owner(parent),
3721 btrfs_header_generation(parent), *level, 1);
3722 BUG_ON(ret);
3723
3724 if (path->locks[*level]) {
3725 btrfs_tree_unlock(path->nodes[*level]);
3726 path->locks[*level] = 0;
3727 }
3728 free_extent_buffer(path->nodes[*level]);
3729 path->nodes[*level] = NULL;
3730 *level += 1;
3731 cond_resched();
3732 return 0;
3733}
3734
3735/*
3736 * helper for dropping snapshots. This walks back up the tree in the path
3737 * to find the first node higher up where we haven't yet gone through
3738 * all the slots
3739 */
3740static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3741 struct btrfs_root *root,
3742 struct btrfs_path *path,
3743 int *level, int max_level)
3744{
3745 u64 root_owner;
3746 u64 root_gen;
3747 struct btrfs_root_item *root_item = &root->root_item;
3748 int i;
3749 int slot;
3750 int ret;
3751
3752 for (i = *level; i < max_level && path->nodes[i]; i++) {
3753 slot = path->slots[i];
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key;
3757 node = path->nodes[i];
3758 path->slots[i]++;
3759 *level = i;
3760 WARN_ON(*level == 0);
3761 btrfs_node_key(node, &disk_key, path->slots[i]);
3762 memcpy(&root_item->drop_progress,
3763 &disk_key, sizeof(disk_key));
3764 root_item->drop_level = i;
3765 return 0;
3766 } else {
3767 struct extent_buffer *parent;
3768 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level];
3770 else
3771 parent = path->nodes[*level + 1];
3772
3773 root_owner = btrfs_header_owner(parent);
3774 root_gen = btrfs_header_generation(parent);
3775
3776 clean_tree_block(trans, root, path->nodes[*level]);
3777 ret = btrfs_free_extent(trans, root,
3778 path->nodes[*level]->start,
3779 path->nodes[*level]->len,
3780 parent->start, root_owner,
3781 root_gen, *level, 1);
3782 BUG_ON(ret);
3783 if (path->locks[*level]) {
3784 btrfs_tree_unlock(path->nodes[*level]);
3785 path->locks[*level] = 0;
3786 }
3787 free_extent_buffer(path->nodes[*level]);
3788 path->nodes[*level] = NULL;
3789 *level = i + 1;
3790 }
3791 }
3792 return 1;
3793}
3794
3795/*
3796 * drop the reference count on the tree rooted at 'snap'. This traverses
3797 * the tree freeing any blocks that have a ref count of zero after being
3798 * decremented.
3799 */
3800int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3801 *root)
3802{
3803 int ret = 0;
3804 int wret;
3805 int level;
3806 struct btrfs_path *path;
3807 int i;
3808 int orig_level;
3809 struct btrfs_root_item *root_item = &root->root_item;
3810
3811 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3812 path = btrfs_alloc_path();
3813 BUG_ON(!path);
3814
3815 level = btrfs_header_level(root->node);
3816 orig_level = level;
3817 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3818 path->nodes[level] = root->node;
3819 extent_buffer_get(root->node);
3820 path->slots[level] = 0;
3821 } else {
3822 struct btrfs_key key;
3823 struct btrfs_disk_key found_key;
3824 struct extent_buffer *node;
3825
3826 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3827 level = root_item->drop_level;
3828 path->lowest_level = level;
3829 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3830 if (wret < 0) {
3831 ret = wret;
3832 goto out;
3833 }
3834 node = path->nodes[level];
3835 btrfs_node_key(node, &found_key, path->slots[level]);
3836 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3837 sizeof(found_key)));
3838 /*
3839 * unlock our path, this is safe because only this
3840 * function is allowed to delete this snapshot
3841 */
3842 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3843 if (path->nodes[i] && path->locks[i]) {
3844 path->locks[i] = 0;
3845 btrfs_tree_unlock(path->nodes[i]);
3846 }
3847 }
3848 }
3849 while (1) {
3850 wret = walk_down_tree(trans, root, path, &level);
3851 if (wret > 0)
3852 break;
3853 if (wret < 0)
3854 ret = wret;
3855
3856 wret = walk_up_tree(trans, root, path, &level,
3857 BTRFS_MAX_LEVEL);
3858 if (wret > 0)
3859 break;
3860 if (wret < 0)
3861 ret = wret;
3862 if (trans->transaction->in_commit) {
3863 ret = -EAGAIN;
3864 break;
3865 }
3866 atomic_inc(&root->fs_info->throttle_gen);
3867 wake_up(&root->fs_info->transaction_throttle);
3868 }
3869 for (i = 0; i <= orig_level; i++) {
3870 if (path->nodes[i]) {
3871 free_extent_buffer(path->nodes[i]);
3872 path->nodes[i] = NULL;
3873 }
3874 }
3875out:
3876 btrfs_free_path(path);
3877 return ret;
3878}
3879
3880int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3881 struct btrfs_root *root,
3882 struct extent_buffer *node,
3883 struct extent_buffer *parent)
3884{
3885 struct btrfs_path *path;
3886 int level;
3887 int parent_level;
3888 int ret = 0;
3889 int wret;
3890
3891 path = btrfs_alloc_path();
3892 BUG_ON(!path);
3893
3894 BUG_ON(!btrfs_tree_locked(parent));
3895 parent_level = btrfs_header_level(parent);
3896 extent_buffer_get(parent);
3897 path->nodes[parent_level] = parent;
3898 path->slots[parent_level] = btrfs_header_nritems(parent);
3899
3900 BUG_ON(!btrfs_tree_locked(node));
3901 level = btrfs_header_level(node);
3902 extent_buffer_get(node);
3903 path->nodes[level] = node;
3904 path->slots[level] = 0;
3905
3906 while (1) {
3907 wret = walk_down_subtree(trans, root, path, &level);
3908 if (wret < 0)
3909 ret = wret;
3910 if (wret != 0)
3911 break;
3912
3913 wret = walk_up_tree(trans, root, path, &level, parent_level);
3914 if (wret < 0)
3915 ret = wret;
3916 if (wret != 0)
3917 break;
3918 }
3919
3920 btrfs_free_path(path);
3921 return ret;
3922}
3923
/*
 * Return the last index of a window of 'nr' entries starting at 'start',
 * capped at 'last'.
 */
static unsigned long calc_ra(unsigned long start, unsigned long last,
			     unsigned long nr)
{
	unsigned long window_end = start + nr - 1;

	return last < window_end ? last : window_end;
}
3929
3930static noinline int relocate_inode_pages(struct inode *inode, u64 start,
3931 u64 len)
3932{
3933 u64 page_start;
3934 u64 page_end;
3935 unsigned long first_index;
3936 unsigned long last_index;
3937 unsigned long i;
3938 struct page *page;
3939 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3940 struct file_ra_state *ra;
3941 struct btrfs_ordered_extent *ordered;
3942 unsigned int total_read = 0;
3943 unsigned int total_dirty = 0;
3944 int ret = 0;
3945
3946 ra = kzalloc(sizeof(*ra), GFP_NOFS);
3947
3948 mutex_lock(&inode->i_mutex);
3949 first_index = start >> PAGE_CACHE_SHIFT;
3950 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3951
3952 /* make sure the dirty trick played by the caller work */
3953 ret = invalidate_inode_pages2_range(inode->i_mapping,
3954 first_index, last_index);
3955 if (ret)
3956 goto out_unlock;
3957
3958 file_ra_state_init(ra, inode->i_mapping);
3959
3960 for (i = first_index ; i <= last_index; i++) {
3961 if (total_read % ra->ra_pages == 0) {
3962 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3963 calc_ra(i, last_index, ra->ra_pages));
3964 }
3965 total_read++;
3966again:
3967 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3968 BUG_ON(1);
3969 page = grab_cache_page(inode->i_mapping, i);
3970 if (!page) {
3971 ret = -ENOMEM;
3972 goto out_unlock;
3973 }
3974 if (!PageUptodate(page)) {
3975 btrfs_readpage(NULL, page);
3976 lock_page(page);
3977 if (!PageUptodate(page)) {
3978 unlock_page(page);
3979 page_cache_release(page);
3980 ret = -EIO;
3981 goto out_unlock;
3982 }
3983 }
3984 wait_on_page_writeback(page);
3985
3986 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3987 page_end = page_start + PAGE_CACHE_SIZE - 1;
3988 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3989
3990 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3991 if (ordered) {
3992 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3993 unlock_page(page);
3994 page_cache_release(page);
3995 btrfs_start_ordered_extent(inode, ordered, 1);
3996 btrfs_put_ordered_extent(ordered);
3997 goto again;
3998 }
3999 set_page_extent_mapped(page);
4000
4001 if (i == first_index)
4002 set_extent_bits(io_tree, page_start, page_end,
4003 EXTENT_BOUNDARY, GFP_NOFS);
4004 btrfs_set_extent_delalloc(inode, page_start, page_end);
4005
4006 set_page_dirty(page);
4007 total_dirty++;
4008
4009 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4010 unlock_page(page);
4011 page_cache_release(page);
4012 }
4013
4014out_unlock:
4015 kfree(ra);
4016 mutex_unlock(&inode->i_mutex);
4017 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4018 return ret;
4019}
4020
4021static noinline int relocate_data_extent(struct inode *reloc_inode,
4022 struct btrfs_key *extent_key,
4023 u64 offset)
4024{
4025 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4026 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4027 struct extent_map *em;
4028 u64 start = extent_key->objectid - offset;
4029 u64 end = start + extent_key->offset - 1;
4030
4031 em = alloc_extent_map(GFP_NOFS);
4032 BUG_ON(!em || IS_ERR(em));
4033
4034 em->start = start;
4035 em->len = extent_key->offset;
4036 em->block_len = extent_key->offset;
4037 em->block_start = extent_key->objectid;
4038 em->bdev = root->fs_info->fs_devices->latest_bdev;
4039 set_bit(EXTENT_FLAG_PINNED, &em->flags);
4040
4041 /* setup extent map to cheat btrfs_readpage */
4042 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4043 while (1) {
4044 int ret;
4045 spin_lock(&em_tree->lock);
4046 ret = add_extent_mapping(em_tree, em);
4047 spin_unlock(&em_tree->lock);
4048 if (ret != -EEXIST) {
4049 free_extent_map(em);
4050 break;
4051 }
4052 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4053 }
4054 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4055
4056 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4057}
4058
4059struct btrfs_ref_path {
4060 u64 extent_start;
4061 u64 nodes[BTRFS_MAX_LEVEL];
4062 u64 root_objectid;
4063 u64 root_generation;
4064 u64 owner_objectid;
4065 u32 num_refs;
4066 int lowest_level;
4067 int current_level;
4068 int shared_level;
4069
4070 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4071 u64 new_nodes[BTRFS_MAX_LEVEL];
4072};
4073
4074struct disk_extent {
4075 u64 ram_bytes;
4076 u64 disk_bytenr;
4077 u64 disk_num_bytes;
4078 u64 offset;
4079 u64 num_bytes;
4080 u8 compression;
4081 u8 encryption;
4082 u16 other_encoding;
4083};
4084
4085static int is_cowonly_root(u64 root_objectid)
4086{
4087 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4088 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4089 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4090 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4091 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4092 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4093 return 1;
4094 return 0;
4095}
4096
4097static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4098 struct btrfs_root *extent_root,
4099 struct btrfs_ref_path *ref_path,
4100 int first_time)
4101{
4102 struct extent_buffer *leaf;
4103 struct btrfs_path *path;
4104 struct btrfs_extent_ref *ref;
4105 struct btrfs_key key;
4106 struct btrfs_key found_key;
4107 u64 bytenr;
4108 u32 nritems;
4109 int level;
4110 int ret = 1;
4111
4112 path = btrfs_alloc_path();
4113 if (!path)
4114 return -ENOMEM;
4115
4116 if (first_time) {
4117 ref_path->lowest_level = -1;
4118 ref_path->current_level = -1;
4119 ref_path->shared_level = -1;
4120 goto walk_up;
4121 }
4122walk_down:
4123 level = ref_path->current_level - 1;
4124 while (level >= -1) {
4125 u64 parent;
4126 if (level < ref_path->lowest_level)
4127 break;
4128
4129 if (level >= 0)
4130 bytenr = ref_path->nodes[level];
4131 else
4132 bytenr = ref_path->extent_start;
4133 BUG_ON(bytenr == 0);
4134
4135 parent = ref_path->nodes[level + 1];
4136 ref_path->nodes[level + 1] = 0;
4137 ref_path->current_level = level;
4138 BUG_ON(parent == 0);
4139
4140 key.objectid = bytenr;
4141 key.offset = parent + 1;
4142 key.type = BTRFS_EXTENT_REF_KEY;
4143
4144 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4145 if (ret < 0)
4146 goto out;
4147 BUG_ON(ret == 0);
4148
4149 leaf = path->nodes[0];
4150 nritems = btrfs_header_nritems(leaf);
4151 if (path->slots[0] >= nritems) {
4152 ret = btrfs_next_leaf(extent_root, path);
4153 if (ret < 0)
4154 goto out;
4155 if (ret > 0)
4156 goto next;
4157 leaf = path->nodes[0];
4158 }
4159
4160 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4161 if (found_key.objectid == bytenr &&
4162 found_key.type == BTRFS_EXTENT_REF_KEY) {
4163 if (level < ref_path->shared_level)
4164 ref_path->shared_level = level;
4165 goto found;
4166 }
4167next:
4168 level--;
4169 btrfs_release_path(extent_root, path);
4170 cond_resched();
4171 }
4172 /* reached lowest level */
4173 ret = 1;
4174 goto out;
4175walk_up:
4176 level = ref_path->current_level;
4177 while (level < BTRFS_MAX_LEVEL - 1) {
4178 u64 ref_objectid;
4179
4180 if (level >= 0)
4181 bytenr = ref_path->nodes[level];
4182 else
4183 bytenr = ref_path->extent_start;
4184
4185 BUG_ON(bytenr == 0);
4186
4187 key.objectid = bytenr;
4188 key.offset = 0;
4189 key.type = BTRFS_EXTENT_REF_KEY;
4190
4191 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4192 if (ret < 0)
4193 goto out;
4194
4195 leaf = path->nodes[0];
4196 nritems = btrfs_header_nritems(leaf);
4197 if (path->slots[0] >= nritems) {
4198 ret = btrfs_next_leaf(extent_root, path);
4199 if (ret < 0)
4200 goto out;
4201 if (ret > 0) {
4202 /* the extent was freed by someone */
4203 if (ref_path->lowest_level == level)
4204 goto out;
4205 btrfs_release_path(extent_root, path);
4206 goto walk_down;
4207 }
4208 leaf = path->nodes[0];
4209 }
4210
4211 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4212 if (found_key.objectid != bytenr ||
4213 found_key.type != BTRFS_EXTENT_REF_KEY) {
4214 /* the extent was freed by someone */
4215 if (ref_path->lowest_level == level) {
4216 ret = 1;
4217 goto out;
4218 }
4219 btrfs_release_path(extent_root, path);
4220 goto walk_down;
4221 }
4222found:
4223 ref = btrfs_item_ptr(leaf, path->slots[0],
4224 struct btrfs_extent_ref);
4225 ref_objectid = btrfs_ref_objectid(leaf, ref);
4226 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4227 if (first_time) {
4228 level = (int)ref_objectid;
4229 BUG_ON(level >= BTRFS_MAX_LEVEL);
4230 ref_path->lowest_level = level;
4231 ref_path->current_level = level;
4232 ref_path->nodes[level] = bytenr;
4233 } else {
4234 WARN_ON(ref_objectid != level);
4235 }
4236 } else {
4237 WARN_ON(level != -1);
4238 }
4239 first_time = 0;
4240
4241 if (ref_path->lowest_level == level) {
4242 ref_path->owner_objectid = ref_objectid;
4243 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4244 }
4245
4246 /*
4247 * the block is tree root or the block isn't in reference
4248 * counted tree.
4249 */
4250 if (found_key.objectid == found_key.offset ||
4251 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4252 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4253 ref_path->root_generation =
4254 btrfs_ref_generation(leaf, ref);
4255 if (level < 0) {
4256 /* special reference from the tree log */
4257 ref_path->nodes[0] = found_key.offset;
4258 ref_path->current_level = 0;
4259 }
4260 ret = 0;
4261 goto out;
4262 }
4263
4264 level++;
4265 BUG_ON(ref_path->nodes[level] != 0);
4266 ref_path->nodes[level] = found_key.offset;
4267 ref_path->current_level = level;
4268
4269 /*
4270 * the reference was created in the running transaction,
4271 * no need to continue walking up.
4272 */
4273 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4274 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4275 ref_path->root_generation =
4276 btrfs_ref_generation(leaf, ref);
4277 ret = 0;
4278 goto out;
4279 }
4280
4281 btrfs_release_path(extent_root, path);
4282 cond_resched();
4283 }
4284 /* reached max tree level, but no tree root found. */
4285 BUG();
4286out:
4287 btrfs_free_path(path);
4288 return ret;
4289}
4290
4291static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4292 struct btrfs_root *extent_root,
4293 struct btrfs_ref_path *ref_path,
4294 u64 extent_start)
4295{
4296 memset(ref_path, 0, sizeof(*ref_path));
4297 ref_path->extent_start = extent_start;
4298
4299 return __next_ref_path(trans, extent_root, ref_path, 1);
4300}
4301
/*
 * Advance @ref_path to the next reference path of the extent it was
 * initialised for.  This is the shared walker invoked with its
 * "first time" flag cleared; its return value is passed through unchanged.
 */
static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
			       struct btrfs_root *extent_root,
			       struct btrfs_ref_path *ref_path)
{
	const int first_time = 0;

	return __next_ref_path(trans, extent_root, ref_path, first_time);
}
4308
4309static noinline int get_new_locations(struct inode *reloc_inode,
4310 struct btrfs_key *extent_key,
4311 u64 offset, int no_fragment,
4312 struct disk_extent **extents,
4313 int *nr_extents)
4314{
4315 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4316 struct btrfs_path *path;
4317 struct btrfs_file_extent_item *fi;
4318 struct extent_buffer *leaf;
4319 struct disk_extent *exts = *extents;
4320 struct btrfs_key found_key;
4321 u64 cur_pos;
4322 u64 last_byte;
4323 u32 nritems;
4324 int nr = 0;
4325 int max = *nr_extents;
4326 int ret;
4327
4328 WARN_ON(!no_fragment && *extents);
4329 if (!exts) {
4330 max = 1;
4331 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4332 if (!exts)
4333 return -ENOMEM;
4334 }
4335
4336 path = btrfs_alloc_path();
4337 BUG_ON(!path);
4338
4339 cur_pos = extent_key->objectid - offset;
4340 last_byte = extent_key->objectid + extent_key->offset;
4341 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4342 cur_pos, 0);
4343 if (ret < 0)
4344 goto out;
4345 if (ret > 0) {
4346 ret = -ENOENT;
4347 goto out;
4348 }
4349
4350 while (1) {
4351 leaf = path->nodes[0];
4352 nritems = btrfs_header_nritems(leaf);
4353 if (path->slots[0] >= nritems) {
4354 ret = btrfs_next_leaf(root, path);
4355 if (ret < 0)
4356 goto out;
4357 if (ret > 0)
4358 break;
4359 leaf = path->nodes[0];
4360 }
4361
4362 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4363 if (found_key.offset != cur_pos ||
4364 found_key.type != BTRFS_EXTENT_DATA_KEY ||
4365 found_key.objectid != reloc_inode->i_ino)
4366 break;
4367
4368 fi = btrfs_item_ptr(leaf, path->slots[0],
4369 struct btrfs_file_extent_item);
4370 if (btrfs_file_extent_type(leaf, fi) !=
4371 BTRFS_FILE_EXTENT_REG ||
4372 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4373 break;
4374
4375 if (nr == max) {
4376 struct disk_extent *old = exts;
4377 max *= 2;
4378 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
4379 memcpy(exts, old, sizeof(*exts) * nr);
4380 if (old != *extents)
4381 kfree(old);
4382 }
4383
4384 exts[nr].disk_bytenr =
4385 btrfs_file_extent_disk_bytenr(leaf, fi);
4386 exts[nr].disk_num_bytes =
4387 btrfs_file_extent_disk_num_bytes(leaf, fi);
4388 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4389 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4390 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4391 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4392 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4393 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4394 fi);
4395 BUG_ON(exts[nr].offset > 0);
4396 BUG_ON(exts[nr].compression || exts[nr].encryption);
4397 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4398
4399 cur_pos += exts[nr].num_bytes;
4400 nr++;
4401
4402 if (cur_pos + offset >= last_byte)
4403 break;
4404
4405 if (no_fragment) {
4406 ret = 1;
4407 goto out;
4408 }
4409 path->slots[0]++;
4410 }
4411
4412 BUG_ON(cur_pos + offset > last_byte);
4413 if (cur_pos + offset < last_byte) {
4414 ret = -ENOENT;
4415 goto out;
4416 }
4417 ret = 0;
4418out:
4419 btrfs_free_path(path);
4420 if (ret) {
4421 if (exts != *extents)
4422 kfree(exts);
4423 } else {
4424 *extents = exts;
4425 *nr_extents = nr;
4426 }
4427 return ret;
4428}
4429
4430static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4431 struct btrfs_root *root,
4432 struct btrfs_path *path,
4433 struct btrfs_key *extent_key,
4434 struct btrfs_key *leaf_key,
4435 struct btrfs_ref_path *ref_path,
4436 struct disk_extent *new_extents,
4437 int nr_extents)
4438{
4439 struct extent_buffer *leaf;
4440 struct btrfs_file_extent_item *fi;
4441 struct inode *inode = NULL;
4442 struct btrfs_key key;
4443 u64 lock_start = 0;
4444 u64 lock_end = 0;
4445 u64 num_bytes;
4446 u64 ext_offset;
4447 u64 first_pos;
4448 u32 nritems;
4449 int nr_scaned = 0;
4450 int extent_locked = 0;
4451 int extent_type;
4452 int ret;
4453
4454 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid &&
4459 key.type < BTRFS_EXTENT_DATA_KEY)) {
4460 key.objectid = ref_path->owner_objectid;
4461 key.type = BTRFS_EXTENT_DATA_KEY;
4462 key.offset = 0;
4463 }
4464 }
4465
4466 while (1) {
4467 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4468 if (ret < 0)
4469 goto out;
4470
4471 leaf = path->nodes[0];
4472 nritems = btrfs_header_nritems(leaf);
4473next:
4474 if (extent_locked && ret > 0) {
4475 /*
4476 * the file extent item was modified by someone
4477 * before the extent got locked.
4478 */
4479 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4480 lock_end, GFP_NOFS);
4481 extent_locked = 0;
4482 }
4483
4484 if (path->slots[0] >= nritems) {
4485 if (++nr_scaned > 2)
4486 break;
4487
4488 BUG_ON(extent_locked);
4489 ret = btrfs_next_leaf(root, path);
4490 if (ret < 0)
4491 goto out;
4492 if (ret > 0)
4493 break;
4494 leaf = path->nodes[0];
4495 nritems = btrfs_header_nritems(leaf);
4496 }
4497
4498 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4499
4500 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4501 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset))
4505 break;
4506 }
4507
4508 if (inode && key.objectid != inode->i_ino) {
4509 BUG_ON(extent_locked);
4510 btrfs_release_path(root, path);
4511 mutex_unlock(&inode->i_mutex);
4512 iput(inode);
4513 inode = NULL;
4514 continue;
4515 }
4516
4517 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4518 path->slots[0]++;
4519 ret = 1;
4520 goto next;
4521 }
4522 fi = btrfs_item_ptr(leaf, path->slots[0],
4523 struct btrfs_file_extent_item);
4524 extent_type = btrfs_file_extent_type(leaf, fi);
4525 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4526 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4527 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4528 extent_key->objectid)) {
4529 path->slots[0]++;
4530 ret = 1;
4531 goto next;
4532 }
4533
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536
4537 if (first_pos > key.offset - ext_offset)
4538 first_pos = key.offset - ext_offset;
4539
4540 if (!extent_locked) {
4541 lock_start = key.offset;
4542 lock_end = lock_start + num_bytes - 1;
4543 } else {
4544 if (lock_start > key.offset ||
4545 lock_end + 1 < key.offset + num_bytes) {
4546 unlock_extent(&BTRFS_I(inode)->io_tree,
4547 lock_start, lock_end, GFP_NOFS);
4548 extent_locked = 0;
4549 }
4550 }
4551
4552 if (!inode) {
4553 btrfs_release_path(root, path);
4554
4555 inode = btrfs_iget_locked(root->fs_info->sb,
4556 key.objectid, root);
4557 if (inode->i_state & I_NEW) {
4558 BTRFS_I(inode)->root = root;
4559 BTRFS_I(inode)->location.objectid =
4560 key.objectid;
4561 BTRFS_I(inode)->location.type =
4562 BTRFS_INODE_ITEM_KEY;
4563 BTRFS_I(inode)->location.offset = 0;
4564 btrfs_read_locked_inode(inode);
4565 unlock_new_inode(inode);
4566 }
4567 /*
4568 * some code call btrfs_commit_transaction while
4569 * holding the i_mutex, so we can't use mutex_lock
4570 * here.
4571 */
4572 if (is_bad_inode(inode) ||
4573 !mutex_trylock(&inode->i_mutex)) {
4574 iput(inode);
4575 inode = NULL;
4576 key.offset = (u64)-1;
4577 goto skip;
4578 }
4579 }
4580
4581 if (!extent_locked) {
4582 struct btrfs_ordered_extent *ordered;
4583
4584 btrfs_release_path(root, path);
4585
4586 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4587 lock_end, GFP_NOFS);
4588 ordered = btrfs_lookup_first_ordered_extent(inode,
4589 lock_end);
4590 if (ordered &&
4591 ordered->file_offset <= lock_end &&
4592 ordered->file_offset + ordered->len > lock_start) {
4593 unlock_extent(&BTRFS_I(inode)->io_tree,
4594 lock_start, lock_end, GFP_NOFS);
4595 btrfs_start_ordered_extent(inode, ordered, 1);
4596 btrfs_put_ordered_extent(ordered);
4597 key.offset += num_bytes;
4598 goto skip;
4599 }
4600 if (ordered)
4601 btrfs_put_ordered_extent(ordered);
4602
4603 extent_locked = 1;
4604 continue;
4605 }
4606
4607 if (nr_extents == 1) {
4608 /* update extent pointer in place */
4609 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4610 new_extents[0].disk_bytenr);
4611 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4612 new_extents[0].disk_num_bytes);
4613 btrfs_mark_buffer_dirty(leaf);
4614
4615 btrfs_drop_extent_cache(inode, key.offset,
4616 key.offset + num_bytes - 1, 0);
4617
4618 ret = btrfs_inc_extent_ref(trans, root,
4619 new_extents[0].disk_bytenr,
4620 new_extents[0].disk_num_bytes,
4621 leaf->start,
4622 root->root_key.objectid,
4623 trans->transid,
4624 key.objectid);
4625 BUG_ON(ret);
4626
4627 ret = btrfs_free_extent(trans, root,
4628 extent_key->objectid,
4629 extent_key->offset,
4630 leaf->start,
4631 btrfs_header_owner(leaf),
4632 btrfs_header_generation(leaf),
4633 key.objectid, 0);
4634 BUG_ON(ret);
4635
4636 btrfs_release_path(root, path);
4637 key.offset += num_bytes;
4638 } else {
4639 BUG_ON(1);
4640#if 0
4641 u64 alloc_hint;
4642 u64 extent_len;
4643 int i;
4644 /*
4645 * drop old extent pointer at first, then insert the
4646 * new pointers one bye one
4647 */
4648 btrfs_release_path(root, path);
4649 ret = btrfs_drop_extents(trans, root, inode, key.offset,
4650 key.offset + num_bytes,
4651 key.offset, &alloc_hint);
4652 BUG_ON(ret);
4653
4654 for (i = 0; i < nr_extents; i++) {
4655 if (ext_offset >= new_extents[i].num_bytes) {
4656 ext_offset -= new_extents[i].num_bytes;
4657 continue;
4658 }
4659 extent_len = min(new_extents[i].num_bytes -
4660 ext_offset, num_bytes);
4661
4662 ret = btrfs_insert_empty_item(trans, root,
4663 path, &key,
4664 sizeof(*fi));
4665 BUG_ON(ret);
4666
4667 leaf = path->nodes[0];
4668 fi = btrfs_item_ptr(leaf, path->slots[0],
4669 struct btrfs_file_extent_item);
4670 btrfs_set_file_extent_generation(leaf, fi,
4671 trans->transid);
4672 btrfs_set_file_extent_type(leaf, fi,
4673 BTRFS_FILE_EXTENT_REG);
4674 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4675 new_extents[i].disk_bytenr);
4676 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4677 new_extents[i].disk_num_bytes);
4678 btrfs_set_file_extent_ram_bytes(leaf, fi,
4679 new_extents[i].ram_bytes);
4680
4681 btrfs_set_file_extent_compression(leaf, fi,
4682 new_extents[i].compression);
4683 btrfs_set_file_extent_encryption(leaf, fi,
4684 new_extents[i].encryption);
4685 btrfs_set_file_extent_other_encoding(leaf, fi,
4686 new_extents[i].other_encoding);
4687
4688 btrfs_set_file_extent_num_bytes(leaf, fi,
4689 extent_len);
4690 ext_offset += new_extents[i].offset;
4691 btrfs_set_file_extent_offset(leaf, fi,
4692 ext_offset);
4693 btrfs_mark_buffer_dirty(leaf);
4694
4695 btrfs_drop_extent_cache(inode, key.offset,
4696 key.offset + extent_len - 1, 0);
4697
4698 ret = btrfs_inc_extent_ref(trans, root,
4699 new_extents[i].disk_bytenr,
4700 new_extents[i].disk_num_bytes,
4701 leaf->start,
4702 root->root_key.objectid,
4703 trans->transid, key.objectid);
4704 BUG_ON(ret);
4705 btrfs_release_path(root, path);
4706
4707 inode_add_bytes(inode, extent_len);
4708
4709 ext_offset = 0;
4710 num_bytes -= extent_len;
4711 key.offset += extent_len;
4712
4713 if (num_bytes == 0)
4714 break;
4715 }
4716 BUG_ON(i >= nr_extents);
4717#endif
4718 }
4719
4720 if (extent_locked) {
4721 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4722 lock_end, GFP_NOFS);
4723 extent_locked = 0;
4724 }
4725skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset)
4728 break;
4729
4730 cond_resched();
4731 }
4732 ret = 0;
4733out:
4734 btrfs_release_path(root, path);
4735 if (inode) {
4736 mutex_unlock(&inode->i_mutex);
4737 if (extent_locked) {
4738 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4739 lock_end, GFP_NOFS);
4740 }
4741 iput(inode);
4742 }
4743 return ret;
4744}
4745
4746int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4747 struct btrfs_root *root,
4748 struct extent_buffer *buf, u64 orig_start)
4749{
4750 int level;
4751 int ret;
4752
4753 BUG_ON(btrfs_header_generation(buf) != trans->transid);
4754 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4755
4756 level = btrfs_header_level(buf);
4757 if (level == 0) {
4758 struct btrfs_leaf_ref *ref;
4759 struct btrfs_leaf_ref *orig_ref;
4760
4761 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4762 if (!orig_ref)
4763 return -ENOENT;
4764
4765 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4766 if (!ref) {
4767 btrfs_free_leaf_ref(root, orig_ref);
4768 return -ENOMEM;
4769 }
4770
4771 ref->nritems = orig_ref->nritems;
4772 memcpy(ref->extents, orig_ref->extents,
4773 sizeof(ref->extents[0]) * ref->nritems);
4774
4775 btrfs_free_leaf_ref(root, orig_ref);
4776
4777 ref->root_gen = trans->transid;
4778 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf);
4781 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref);
4784 }
4785 return 0;
4786}
4787
4788static noinline int invalidate_extent_cache(struct btrfs_root *root,
4789 struct extent_buffer *leaf,
4790 struct btrfs_block_group_cache *group,
4791 struct btrfs_root *target_root)
4792{
4793 struct btrfs_key key;
4794 struct inode *inode = NULL;
4795 struct btrfs_file_extent_item *fi;
4796 u64 num_bytes;
4797 u64 skip_objectid = 0;
4798 u32 nritems;
4799 u32 i;
4800
4801 nritems = btrfs_header_nritems(leaf);
4802 for (i = 0; i < nritems; i++) {
4803 btrfs_item_key_to_cpu(leaf, &key, i);
4804 if (key.objectid == skip_objectid ||
4805 key.type != BTRFS_EXTENT_DATA_KEY)
4806 continue;
4807 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4808 if (btrfs_file_extent_type(leaf, fi) ==
4809 BTRFS_FILE_EXTENT_INLINE)
4810 continue;
4811 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4812 continue;
4813 if (!inode || inode->i_ino != key.objectid) {
4814 iput(inode);
4815 inode = btrfs_ilookup(target_root->fs_info->sb,
4816 key.objectid, target_root, 1);
4817 }
4818 if (!inode) {
4819 skip_objectid = key.objectid;
4820 continue;
4821 }
4822 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4823
4824 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4825 key.offset + num_bytes - 1, GFP_NOFS);
4826 btrfs_drop_extent_cache(inode, key.offset,
4827 key.offset + num_bytes - 1, 1);
4828 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4829 key.offset + num_bytes - 1, GFP_NOFS);
4830 cond_resched();
4831 }
4832 iput(inode);
4833 return 0;
4834}
4835
4836static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4837 struct btrfs_root *root,
4838 struct extent_buffer *leaf,
4839 struct btrfs_block_group_cache *group,
4840 struct inode *reloc_inode)
4841{
4842 struct btrfs_key key;
4843 struct btrfs_key extent_key;
4844 struct btrfs_file_extent_item *fi;
4845 struct btrfs_leaf_ref *ref;
4846 struct disk_extent *new_extent;
4847 u64 bytenr;
4848 u64 num_bytes;
4849 u32 nritems;
4850 u32 i;
4851 int ext_index;
4852 int nr_extent;
4853 int ret;
4854
4855 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4856 BUG_ON(!new_extent);
4857
4858 ref = btrfs_lookup_leaf_ref(root, leaf->start);
4859 BUG_ON(!ref);
4860
4861 ext_index = -1;
4862 nritems = btrfs_header_nritems(leaf);
4863 for (i = 0; i < nritems; i++) {
4864 btrfs_item_key_to_cpu(leaf, &key, i);
4865 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4866 continue;
4867 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4868 if (btrfs_file_extent_type(leaf, fi) ==
4869 BTRFS_FILE_EXTENT_INLINE)
4870 continue;
4871 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4872 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4873 if (bytenr == 0)
4874 continue;
4875
4876 ext_index++;
4877 if (bytenr >= group->key.objectid + group->key.offset ||
4878 bytenr + num_bytes <= group->key.objectid)
4879 continue;
4880
4881 extent_key.objectid = bytenr;
4882 extent_key.offset = num_bytes;
4883 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4884 nr_extent = 1;
4885 ret = get_new_locations(reloc_inode, &extent_key,
4886 group->key.objectid, 1,
4887 &new_extent, &nr_extent);
4888 if (ret > 0)
4889 continue;
4890 BUG_ON(ret < 0);
4891
4892 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4893 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4894 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4895 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4896
4897 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4898 new_extent->disk_bytenr);
4899 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4900 new_extent->disk_num_bytes);
4901 btrfs_mark_buffer_dirty(leaf);
4902
4903 ret = btrfs_inc_extent_ref(trans, root,
4904 new_extent->disk_bytenr,
4905 new_extent->disk_num_bytes,
4906 leaf->start,
4907 root->root_key.objectid,
4908 trans->transid, key.objectid);
4909 BUG_ON(ret);
4910 ret = btrfs_free_extent(trans, root,
4911 bytenr, num_bytes, leaf->start,
4912 btrfs_header_owner(leaf),
4913 btrfs_header_generation(leaf),
4914 key.objectid, 0);
4915 BUG_ON(ret);
4916 cond_resched();
4917 }
4918 kfree(new_extent);
4919 BUG_ON(ext_index + 1 != ref->nritems);
4920 btrfs_free_leaf_ref(root, ref);
4921 return 0;
4922}
4923
4924int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4925 struct btrfs_root *root)
4926{
4927 struct btrfs_root *reloc_root;
4928 int ret;
4929
4930 if (root->reloc_root) {
4931 reloc_root = root->reloc_root;
4932 root->reloc_root = NULL;
4933 list_add(&reloc_root->dead_list,
4934 &root->fs_info->dead_reloc_roots);
4935
4936 btrfs_set_root_bytenr(&reloc_root->root_item,
4937 reloc_root->node->start);
4938 btrfs_set_root_level(&root->root_item,
4939 btrfs_header_level(reloc_root->node));
4940 memset(&reloc_root->root_item.drop_progress, 0,
4941 sizeof(struct btrfs_disk_key));
4942 reloc_root->root_item.drop_level = 0;
4943
4944 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4945 &reloc_root->root_key,
4946 &reloc_root->root_item);
4947 BUG_ON(ret);
4948 }
4949 return 0;
4950}
4951
4952int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4953{
4954 struct btrfs_trans_handle *trans;
4955 struct btrfs_root *reloc_root;
4956 struct btrfs_root *prev_root = NULL;
4957 struct list_head dead_roots;
4958 int ret;
4959 unsigned long nr;
4960
4961 INIT_LIST_HEAD(&dead_roots);
4962 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4963
4964 while (!list_empty(&dead_roots)) {
4965 reloc_root = list_entry(dead_roots.prev,
4966 struct btrfs_root, dead_list);
4967 list_del_init(&reloc_root->dead_list);
4968
4969 BUG_ON(reloc_root->commit_root != NULL);
4970 while (1) {
4971 trans = btrfs_join_transaction(root, 1);
4972 BUG_ON(!trans);
4973
4974 mutex_lock(&root->fs_info->drop_mutex);
4975 ret = btrfs_drop_snapshot(trans, reloc_root);
4976 if (ret != -EAGAIN)
4977 break;
4978 mutex_unlock(&root->fs_info->drop_mutex);
4979
4980 nr = trans->blocks_used;
4981 ret = btrfs_end_transaction(trans, root);
4982 BUG_ON(ret);
4983 btrfs_btree_balance_dirty(root, nr);
4984 }
4985
4986 free_extent_buffer(reloc_root->node);
4987
4988 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4989 &reloc_root->root_key);
4990 BUG_ON(ret);
4991 mutex_unlock(&root->fs_info->drop_mutex);
4992
4993 nr = trans->blocks_used;
4994 ret = btrfs_end_transaction(trans, root);
4995 BUG_ON(ret);
4996 btrfs_btree_balance_dirty(root, nr);
4997
4998 kfree(prev_root);
4999 prev_root = reloc_root;
5000 }
5001 if (prev_root) {
5002 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
5003 kfree(prev_root);
5004 }
5005 return 0;
5006}
5007
5008int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5009{
5010 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5011 return 0;
5012}
5013
5014int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5015{
5016 struct btrfs_root *reloc_root;
5017 struct btrfs_trans_handle *trans;
5018 struct btrfs_key location;
5019 int found;
5020 int ret;
5021
5022 mutex_lock(&root->fs_info->tree_reloc_mutex);
5023 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5024 BUG_ON(ret);
5025 found = !list_empty(&root->fs_info->dead_reloc_roots);
5026 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5027
5028 if (found) {
5029 trans = btrfs_start_transaction(root, 1);
5030 BUG_ON(!trans);
5031 ret = btrfs_commit_transaction(trans, root);
5032 BUG_ON(ret);
5033 }
5034
5035 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5036 location.offset = (u64)-1;
5037 location.type = BTRFS_ROOT_ITEM_KEY;
5038
5039 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5040 BUG_ON(!reloc_root);
5041 btrfs_orphan_cleanup(reloc_root);
5042 return 0;
5043}
5044
5045static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5046 struct btrfs_root *root)
5047{
5048 struct btrfs_root *reloc_root;
5049 struct extent_buffer *eb;
5050 struct btrfs_root_item *root_item;
5051 struct btrfs_key root_key;
5052 int ret;
5053
5054 BUG_ON(!root->ref_cows);
5055 if (root->reloc_root)
5056 return 0;
5057
5058 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5059 BUG_ON(!root_item);
5060
5061 ret = btrfs_copy_root(trans, root, root->commit_root,
5062 &eb, BTRFS_TREE_RELOC_OBJECTID);
5063 BUG_ON(ret);
5064
5065 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5066 root_key.offset = root->root_key.objectid;
5067 root_key.type = BTRFS_ROOT_ITEM_KEY;
5068
5069 memcpy(root_item, &root->root_item, sizeof(root_item));
5070 btrfs_set_root_refs(root_item, 0);
5071 btrfs_set_root_bytenr(root_item, eb->start);
5072 btrfs_set_root_level(root_item, btrfs_header_level(eb));
5073 btrfs_set_root_generation(root_item, trans->transid);
5074
5075 btrfs_tree_unlock(eb);
5076 free_extent_buffer(eb);
5077
5078 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5079 &root_key, root_item);
5080 BUG_ON(ret);
5081 kfree(root_item);
5082
5083 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5084 &root_key);
5085 BUG_ON(!reloc_root);
5086 reloc_root->last_trans = trans->transid;
5087 reloc_root->commit_root = NULL;
5088 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5089
5090 root->reloc_root = reloc_root;
5091 return 0;
5092}
5093
5094/*
5095 * Core function of space balance.
5096 *
5097 * The idea is using reloc trees to relocate tree blocks in reference
5098 * counted roots. There is one reloc tree for each subvol, and all
5099 * reloc trees share same root key objectid. Reloc trees are snapshots
5100 * of the latest committed roots of subvols (root->commit_root).
5101 *
5102 * To relocate a tree block referenced by a subvol, there are two steps.
5103 * COW the block through subvol's reloc tree, then update block pointer
5104 * in the subvol to point to the new block. Since all reloc trees share
5105 * same root key objectid, doing special handing for tree blocks owned
5106 * by them is easy. Once a tree block has been COWed in one reloc tree,
5107 * we can use the resulting new block directly when the same block is
5108 * required to COW again through other reloc trees. By this way, relocated
5109 * tree blocks are shared between reloc trees, so they are also shared
5110 * between subvols.
5111 */
5112static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
5113 struct btrfs_root *root,
5114 struct btrfs_path *path,
5115 struct btrfs_key *first_key,
5116 struct btrfs_ref_path *ref_path,
5117 struct btrfs_block_group_cache *group,
5118 struct inode *reloc_inode)
5119{
5120 struct btrfs_root *reloc_root;
5121 struct extent_buffer *eb = NULL;
5122 struct btrfs_key *keys;
5123 u64 *nodes;
5124 int level;
5125 int shared_level;
5126 int lowest_level = 0;
5127 int ret;
5128
5129 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5130 lowest_level = ref_path->owner_objectid;
5131
5132 if (!root->ref_cows) {
5133 path->lowest_level = lowest_level;
5134 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5135 BUG_ON(ret < 0);
5136 path->lowest_level = 0;
5137 btrfs_release_path(root, path);
5138 return 0;
5139 }
5140
5141 mutex_lock(&root->fs_info->tree_reloc_mutex);
5142 ret = init_reloc_tree(trans, root);
5143 BUG_ON(ret);
5144 reloc_root = root->reloc_root;
5145
5146 shared_level = ref_path->shared_level;
5147 ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5148
5149 keys = ref_path->node_keys;
5150 nodes = ref_path->new_nodes;
5151 memset(&keys[shared_level + 1], 0,
5152 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5153 memset(&nodes[shared_level + 1], 0,
5154 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5155
5156 if (nodes[lowest_level] == 0) {
5157 path->lowest_level = lowest_level;
5158 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5159 0, 1);
5160 BUG_ON(ret);
5161 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5162 eb = path->nodes[level];
5163 if (!eb || eb == reloc_root->node)
5164 break;
5165 nodes[level] = eb->start;
5166 if (level == 0)
5167 btrfs_item_key_to_cpu(eb, &keys[level], 0);
5168 else
5169 btrfs_node_key_to_cpu(eb, &keys[level], 0);
5170 }
5171 if (nodes[0] &&
5172 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5173 eb = path->nodes[0];
5174 ret = replace_extents_in_leaf(trans, reloc_root, eb,
5175 group, reloc_inode);
5176 BUG_ON(ret);
5177 }
5178 btrfs_release_path(reloc_root, path);
5179 } else {
5180 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5181 lowest_level);
5182 BUG_ON(ret);
5183 }
5184
5185 /*
5186 * replace tree blocks in the fs tree with tree blocks in
5187 * the reloc tree.
5188 */
5189 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5190 BUG_ON(ret < 0);
5191
5192 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5193 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5194 0, 0);
5195 BUG_ON(ret);
5196 extent_buffer_get(path->nodes[0]);
5197 eb = path->nodes[0];
5198 btrfs_release_path(reloc_root, path);
5199 ret = invalidate_extent_cache(reloc_root, eb, group, root);
5200 BUG_ON(ret);
5201 free_extent_buffer(eb);
5202 }
5203
5204 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5205 path->lowest_level = 0;
5206 return 0;
5207}
5208
5209static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5210 struct btrfs_root *root,
5211 struct btrfs_path *path,
5212 struct btrfs_key *first_key,
5213 struct btrfs_ref_path *ref_path)
5214{
5215 int ret;
5216
5217 ret = relocate_one_path(trans, root, path, first_key,
5218 ref_path, NULL, NULL);
5219 BUG_ON(ret);
5220
5221 if (root == root->fs_info->extent_root)
5222 btrfs_extent_post_op(trans, root);
5223
5224 return 0;
5225}
5226
5227static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5228 struct btrfs_root *extent_root,
5229 struct btrfs_path *path,
5230 struct btrfs_key *extent_key)
5231{
5232 int ret;
5233
5234 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5235 if (ret)
5236 goto out;
5237 ret = btrfs_del_item(trans, extent_root, path);
5238out:
5239 btrfs_release_path(extent_root, path);
5240 return ret;
5241}
5242
5243static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5244 struct btrfs_ref_path *ref_path)
5245{
5246 struct btrfs_key root_key;
5247
5248 root_key.objectid = ref_path->root_objectid;
5249 root_key.type = BTRFS_ROOT_ITEM_KEY;
5250 if (is_cowonly_root(ref_path->root_objectid))
5251 root_key.offset = 0;
5252 else
5253 root_key.offset = (u64)-1;
5254
5255 return btrfs_read_fs_root_no_name(fs_info, &root_key);
5256}
5257
5258static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5259 struct btrfs_path *path,
5260 struct btrfs_key *extent_key,
5261 struct btrfs_block_group_cache *group,
5262 struct inode *reloc_inode, int pass)
5263{
5264 struct btrfs_trans_handle *trans;
5265 struct btrfs_root *found_root;
5266 struct btrfs_ref_path *ref_path = NULL;
5267 struct disk_extent *new_extents = NULL;
5268 int nr_extents = 0;
5269 int loops;
5270 int ret;
5271 int level;
5272 struct btrfs_key first_key;
5273 u64 prev_block = 0;
5274
5275
5276 trans = btrfs_start_transaction(extent_root, 1);
5277 BUG_ON(!trans);
5278
5279 if (extent_key->objectid == 0) {
5280 ret = del_extent_zero(trans, extent_root, path, extent_key);
5281 goto out;
5282 }
5283
5284 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5285 if (!ref_path) {
5286 ret = -ENOMEM;
5287 goto out;
5288 }
5289
5290 for (loops = 0; ; loops++) {
5291 if (loops == 0) {
5292 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5293 extent_key->objectid);
5294 } else {
5295 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5296 }
5297 if (ret < 0)
5298 goto out;
5299 if (ret > 0)
5300 break;
5301
5302 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5303 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5304 continue;
5305
5306 found_root = read_ref_root(extent_root->fs_info, ref_path);
5307 BUG_ON(!found_root);
5308 /*
5309 * for reference counted tree, only process reference paths
5310 * rooted at the latest committed root.
5311 */
5312 if (found_root->ref_cows &&
5313 ref_path->root_generation != found_root->root_key.offset)
5314 continue;
5315
5316 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5317 if (pass == 0) {
5318 /*
5319 * copy data extents to new locations
5320 */
5321 u64 group_start = group->key.objectid;
5322 ret = relocate_data_extent(reloc_inode,
5323 extent_key,
5324 group_start);
5325 if (ret < 0)
5326 goto out;
5327 break;
5328 }
5329 level = 0;
5330 } else {
5331 level = ref_path->owner_objectid;
5332 }
5333
5334 if (prev_block != ref_path->nodes[level]) {
5335 struct extent_buffer *eb;
5336 u64 block_start = ref_path->nodes[level];
5337 u64 block_size = btrfs_level_size(found_root, level);
5338
5339 eb = read_tree_block(found_root, block_start,
5340 block_size, 0);
5341 btrfs_tree_lock(eb);
5342 BUG_ON(level != btrfs_header_level(eb));
5343
5344 if (level == 0)
5345 btrfs_item_key_to_cpu(eb, &first_key, 0);
5346 else
5347 btrfs_node_key_to_cpu(eb, &first_key, 0);
5348
5349 btrfs_tree_unlock(eb);
5350 free_extent_buffer(eb);
5351 prev_block = block_start;
5352 }
5353
5354 btrfs_record_root_in_trans(found_root);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /*
5357 * try to update data extent references while
5358 * keeping metadata shared between snapshots.
5359 */
5360 if (pass == 1) {
5361 ret = relocate_one_path(trans, found_root,
5362 path, &first_key, ref_path,
5363 group, reloc_inode);
5364 if (ret < 0)
5365 goto out;
5366 continue;
5367 }
5368 /*
5369 * use fallback method to process the remaining
5370 * references.
5371 */
5372 if (!new_extents) {
5373 u64 group_start = group->key.objectid;
5374 new_extents = kmalloc(sizeof(*new_extents),
5375 GFP_NOFS);
5376 nr_extents = 1;
5377 ret = get_new_locations(reloc_inode,
5378 extent_key,
5379 group_start, 1,
5380 &new_extents,
5381 &nr_extents);
5382 if (ret)
5383 goto out;
5384 }
5385 ret = replace_one_extent(trans, found_root,
5386 path, extent_key,
5387 &first_key, ref_path,
5388 new_extents, nr_extents);
5389 } else {
5390 ret = relocate_tree_block(trans, found_root, path,
5391 &first_key, ref_path);
5392 }
5393 if (ret < 0)
5394 goto out;
5395 }
5396 ret = 0;
5397out:
5398 btrfs_end_transaction(trans, extent_root);
5399 kfree(new_extents);
5400 kfree(ref_path);
5401 return ret;
5402}
5403
5404static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5405{
5406 u64 num_devices;
5407 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5408 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5409
5410 num_devices = root->fs_info->fs_devices->rw_devices;
5411 if (num_devices == 1) {
5412 stripped |= BTRFS_BLOCK_GROUP_DUP;
5413 stripped = flags & ~stripped;
5414
5415 /* turn raid0 into single device chunks */
5416 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5417 return stripped;
5418
5419 /* turn mirroring into duplication */
5420 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5421 BTRFS_BLOCK_GROUP_RAID10))
5422 return stripped | BTRFS_BLOCK_GROUP_DUP;
5423 return flags;
5424 } else {
5425 /* they already had raid on here, just return */
5426 if (flags & stripped)
5427 return flags;
5428
5429 stripped |= BTRFS_BLOCK_GROUP_DUP;
5430 stripped = flags & ~stripped;
5431
5432 /* switch duplicated blocks with raid1 */
5433 if (flags & BTRFS_BLOCK_GROUP_DUP)
5434 return stripped | BTRFS_BLOCK_GROUP_RAID1;
5435
5436 /* turn single device chunks into raid0 */
5437 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5438 }
5439 return flags;
5440}
5441
5442static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5443 struct btrfs_block_group_cache *shrink_block_group,
5444 int force)
5445{
5446 struct btrfs_trans_handle *trans;
5447 u64 new_alloc_flags;
5448 u64 calc;
5449
5450 spin_lock(&shrink_block_group->lock);
5451 if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5452 spin_unlock(&shrink_block_group->lock);
5453
5454 trans = btrfs_start_transaction(root, 1);
5455 spin_lock(&shrink_block_group->lock);
5456
5457 new_alloc_flags = update_block_group_flags(root,
5458 shrink_block_group->flags);
5459 if (new_alloc_flags != shrink_block_group->flags) {
5460 calc =
5461 btrfs_block_group_used(&shrink_block_group->item);
5462 } else {
5463 calc = shrink_block_group->key.offset;
5464 }
5465 spin_unlock(&shrink_block_group->lock);
5466
5467 do_chunk_alloc(trans, root->fs_info->extent_root,
5468 calc + 2 * 1024 * 1024, new_alloc_flags, force);
5469
5470 btrfs_end_transaction(trans, root);
5471 } else
5472 spin_unlock(&shrink_block_group->lock);
5473 return 0;
5474}
5475
5476static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5477 struct btrfs_root *root,
5478 u64 objectid, u64 size)
5479{
5480 struct btrfs_path *path;
5481 struct btrfs_inode_item *item;
5482 struct extent_buffer *leaf;
5483 int ret;
5484
5485 path = btrfs_alloc_path();
5486 if (!path)
5487 return -ENOMEM;
5488
5489 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5490 if (ret)
5491 goto out;
5492
5493 leaf = path->nodes[0];
5494 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5495 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5496 btrfs_set_inode_generation(leaf, item, 1);
5497 btrfs_set_inode_size(leaf, item, size);
5498 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5499 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5500 btrfs_mark_buffer_dirty(leaf);
5501 btrfs_release_path(root, path);
5502out:
5503 btrfs_free_path(path);
5504 return ret;
5505}
5506
5507static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
5508 struct btrfs_block_group_cache *group)
5509{
5510 struct inode *inode = NULL;
5511 struct btrfs_trans_handle *trans;
5512 struct btrfs_root *root;
5513 struct btrfs_key root_key;
5514 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5515 int err = 0;
5516
5517 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5518 root_key.type = BTRFS_ROOT_ITEM_KEY;
5519 root_key.offset = (u64)-1;
5520 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5521 if (IS_ERR(root))
5522 return ERR_CAST(root);
5523
5524 trans = btrfs_start_transaction(root, 1);
5525 BUG_ON(!trans);
5526
5527 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5528 if (err)
5529 goto out;
5530
5531 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5532 BUG_ON(err);
5533
5534 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5535 group->key.offset, 0, group->key.offset,
5536 0, 0, 0);
5537 BUG_ON(err);
5538
5539 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5540 if (inode->i_state & I_NEW) {
5541 BTRFS_I(inode)->root = root;
5542 BTRFS_I(inode)->location.objectid = objectid;
5543 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5544 BTRFS_I(inode)->location.offset = 0;
5545 btrfs_read_locked_inode(inode);
5546 unlock_new_inode(inode);
5547 BUG_ON(is_bad_inode(inode));
5548 } else {
5549 BUG_ON(1);
5550 }
5551 BTRFS_I(inode)->index_cnt = group->key.objectid;
5552
5553 err = btrfs_orphan_add(trans, inode);
5554out:
5555 btrfs_end_transaction(trans, root);
5556 if (err) {
5557 if (inode)
5558 iput(inode);
5559 inode = ERR_PTR(err);
5560 }
5561 return inode;
5562}
5563
5564int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5565{
5566
5567 struct btrfs_ordered_sum *sums;
5568 struct btrfs_sector_sum *sector_sum;
5569 struct btrfs_ordered_extent *ordered;
5570 struct btrfs_root *root = BTRFS_I(inode)->root;
5571 struct list_head list;
5572 size_t offset;
5573 int ret;
5574 u64 disk_bytenr;
5575
5576 INIT_LIST_HEAD(&list);
5577
5578 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5579 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5580
5581 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5582 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
5583 disk_bytenr + len - 1, &list);
5584
5585 while (!list_empty(&list)) {
5586 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5587 list_del_init(&sums->list);
5588
5589 sector_sum = sums->sums;
5590 sums->bytenr = ordered->start;
5591
5592 offset = 0;
5593 while (offset < sums->len) {
5594 sector_sum->bytenr += ordered->start - disk_bytenr;
5595 sector_sum++;
5596 offset += root->sectorsize;
5597 }
5598
5599 btrfs_add_ordered_sum(inode, ordered, sums);
5600 }
5601 btrfs_put_ordered_extent(ordered);
5602 return 0;
5603}
5604
5605int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5606{
5607 struct btrfs_trans_handle *trans;
5608 struct btrfs_path *path;
5609 struct btrfs_fs_info *info = root->fs_info;
5610 struct extent_buffer *leaf;
5611 struct inode *reloc_inode;
5612 struct btrfs_block_group_cache *block_group;
5613 struct btrfs_key key;
5614 u64 skipped;
5615 u64 cur_byte;
5616 u64 total_found;
5617 u32 nritems;
5618 int ret;
5619 int progress;
5620 int pass = 0;
5621
5622 root = root->fs_info->extent_root;
5623
5624 block_group = btrfs_lookup_block_group(info, group_start);
5625 BUG_ON(!block_group);
5626
5627 printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
5628 (unsigned long long)block_group->key.objectid,
5629 (unsigned long long)block_group->flags);
5630
5631 path = btrfs_alloc_path();
5632 BUG_ON(!path);
5633
5634 reloc_inode = create_reloc_inode(info, block_group);
5635 BUG_ON(IS_ERR(reloc_inode));
5636
5637 __alloc_chunk_for_shrink(root, block_group, 1);
5638 set_block_group_readonly(block_group);
5639
5640 btrfs_start_delalloc_inodes(info->tree_root);
5641 btrfs_wait_ordered_extents(info->tree_root, 0);
5642again:
5643 skipped = 0;
5644 total_found = 0;
5645 progress = 0;
5646 key.objectid = block_group->key.objectid;
5647 key.offset = 0;
5648 key.type = 0;
5649 cur_byte = key.objectid;
5650
5651 trans = btrfs_start_transaction(info->tree_root, 1);
5652 btrfs_commit_transaction(trans, info->tree_root);
5653
5654 mutex_lock(&root->fs_info->cleaner_mutex);
5655 btrfs_clean_old_snapshots(info->tree_root);
5656 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5657 mutex_unlock(&root->fs_info->cleaner_mutex);
5658
5659 while (1) {
5660 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5661 if (ret < 0)
5662 goto out;
5663next:
5664 leaf = path->nodes[0];
5665 nritems = btrfs_header_nritems(leaf);
5666 if (path->slots[0] >= nritems) {
5667 ret = btrfs_next_leaf(root, path);
5668 if (ret < 0)
5669 goto out;
5670 if (ret == 1) {
5671 ret = 0;
5672 break;
5673 }
5674 leaf = path->nodes[0];
5675 nritems = btrfs_header_nritems(leaf);
5676 }
5677
5678 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5679
5680 if (key.objectid >= block_group->key.objectid +
5681 block_group->key.offset)
5682 break;
5683
5684 if (progress && need_resched()) {
5685 btrfs_release_path(root, path);
5686 cond_resched();
5687 progress = 0;
5688 continue;
5689 }
5690 progress = 1;
5691
5692 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5693 key.objectid + key.offset <= cur_byte) {
5694 path->slots[0]++;
5695 goto next;
5696 }
5697
5698 total_found++;
5699 cur_byte = key.objectid + key.offset;
5700 btrfs_release_path(root, path);
5701
5702 __alloc_chunk_for_shrink(root, block_group, 0);
5703 ret = relocate_one_extent(root, path, &key, block_group,
5704 reloc_inode, pass);
5705 BUG_ON(ret < 0);
5706 if (ret > 0)
5707 skipped++;
5708
5709 key.objectid = cur_byte;
5710 key.type = 0;
5711 key.offset = 0;
5712 }
5713
5714 btrfs_release_path(root, path);
5715
5716 if (pass == 0) {
5717 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5718 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5719 }
5720
5721 if (total_found > 0) {
5722 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
5723 (unsigned long long)total_found, pass);
5724 pass++;
5725 if (total_found == skipped && pass > 2) {
5726 iput(reloc_inode);
5727 reloc_inode = create_reloc_inode(info, block_group);
5728 pass = 0;
5729 }
5730 goto again;
5731 }
5732
5733 /* delete reloc_inode */
5734 iput(reloc_inode);
5735
5736 /* unpin extents in this range */
5737 trans = btrfs_start_transaction(info->tree_root, 1);
5738 btrfs_commit_transaction(trans, info->tree_root);
5739
5740 spin_lock(&block_group->lock);
5741 WARN_ON(block_group->pinned > 0);
5742 WARN_ON(block_group->reserved > 0);
5743 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5744 spin_unlock(&block_group->lock);
5745 put_block_group(block_group);
5746 ret = 0;
5747out:
5748 btrfs_free_path(path);
5749 return ret;
5750}
5751
5752static int find_first_block_group(struct btrfs_root *root,
5753 struct btrfs_path *path, struct btrfs_key *key)
5754{
5755 int ret = 0;
5756 struct btrfs_key found_key;
5757 struct extent_buffer *leaf;
5758 int slot;
5759
5760 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5761 if (ret < 0)
5762 goto out;
5763
5764 while (1) {
5765 slot = path->slots[0];
5766 leaf = path->nodes[0];
5767 if (slot >= btrfs_header_nritems(leaf)) {
5768 ret = btrfs_next_leaf(root, path);
5769 if (ret == 0)
5770 continue;
5771 if (ret < 0)
5772 goto out;
5773 break;
5774 }
5775 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5776
5777 if (found_key.objectid >= key->objectid &&
5778 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5779 ret = 0;
5780 goto out;
5781 }
5782 path->slots[0]++;
5783 }
5784 ret = -ENOENT;
5785out:
5786 return ret;
5787}
5788
5789int btrfs_free_block_groups(struct btrfs_fs_info *info)
5790{
5791 struct btrfs_block_group_cache *block_group;
5792 struct rb_node *n;
5793
5794 spin_lock(&info->block_group_cache_lock);
5795 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5796 block_group = rb_entry(n, struct btrfs_block_group_cache,
5797 cache_node);
5798 rb_erase(&block_group->cache_node,
5799 &info->block_group_cache_tree);
5800 spin_unlock(&info->block_group_cache_lock);
5801
5802 btrfs_remove_free_space_cache(block_group);
5803 down_write(&block_group->space_info->groups_sem);
5804 list_del(&block_group->list);
5805 up_write(&block_group->space_info->groups_sem);
5806
5807 WARN_ON(atomic_read(&block_group->count) != 1);
5808 kfree(block_group);
5809
5810 spin_lock(&info->block_group_cache_lock);
5811 }
5812 spin_unlock(&info->block_group_cache_lock);
5813 return 0;
5814}
5815
5816int btrfs_read_block_groups(struct btrfs_root *root)
5817{
5818 struct btrfs_path *path;
5819 int ret;
5820 struct btrfs_block_group_cache *cache;
5821 struct btrfs_fs_info *info = root->fs_info;
5822 struct btrfs_space_info *space_info;
5823 struct btrfs_key key;
5824 struct btrfs_key found_key;
5825 struct extent_buffer *leaf;
5826
5827 root = info->extent_root;
5828 key.objectid = 0;
5829 key.offset = 0;
5830 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5831 path = btrfs_alloc_path();
5832 if (!path)
5833 return -ENOMEM;
5834
5835 while (1) {
5836 ret = find_first_block_group(root, path, &key);
5837 if (ret > 0) {
5838 ret = 0;
5839 goto error;
5840 }
5841 if (ret != 0)
5842 goto error;
5843
5844 leaf = path->nodes[0];
5845 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5846 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5847 if (!cache) {
5848 ret = -ENOMEM;
5849 break;
5850 }
5851
5852 atomic_set(&cache->count, 1);
5853 spin_lock_init(&cache->lock);
5854 mutex_init(&cache->alloc_mutex);
5855 mutex_init(&cache->cache_mutex);
5856 INIT_LIST_HEAD(&cache->list);
5857 read_extent_buffer(leaf, &cache->item,
5858 btrfs_item_ptr_offset(leaf, path->slots[0]),
5859 sizeof(cache->item));
5860 memcpy(&cache->key, &found_key, sizeof(found_key));
5861
5862 key.objectid = found_key.objectid + found_key.offset;
5863 btrfs_release_path(root, path);
5864 cache->flags = btrfs_block_group_flags(&cache->item);
5865
5866 ret = update_space_info(info, cache->flags, found_key.offset,
5867 btrfs_block_group_used(&cache->item),
5868 &space_info);
5869 BUG_ON(ret);
5870 cache->space_info = space_info;
5871 down_write(&space_info->groups_sem);
5872 list_add_tail(&cache->list, &space_info->block_groups);
5873 up_write(&space_info->groups_sem);
5874
5875 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5876 BUG_ON(ret);
5877
5878 set_avail_alloc_bits(root->fs_info, cache->flags);
5879 if (btrfs_chunk_readonly(root, cache->key.objectid))
5880 set_block_group_readonly(cache);
5881 }
5882 ret = 0;
5883error:
5884 btrfs_free_path(path);
5885 return ret;
5886}
5887
5888int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5889 struct btrfs_root *root, u64 bytes_used,
5890 u64 type, u64 chunk_objectid, u64 chunk_offset,
5891 u64 size)
5892{
5893 int ret;
5894 struct btrfs_root *extent_root;
5895 struct btrfs_block_group_cache *cache;
5896
5897 extent_root = root->fs_info->extent_root;
5898
5899 root->fs_info->last_trans_new_blockgroup = trans->transid;
5900
5901 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5902 if (!cache)
5903 return -ENOMEM;
5904
5905 cache->key.objectid = chunk_offset;
5906 cache->key.offset = size;
5907 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5908 atomic_set(&cache->count, 1);
5909 spin_lock_init(&cache->lock);
5910 mutex_init(&cache->alloc_mutex);
5911 mutex_init(&cache->cache_mutex);
5912 INIT_LIST_HEAD(&cache->list);
5913
5914 btrfs_set_block_group_used(&cache->item, bytes_used);
5915 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5916 cache->flags = type;
5917 btrfs_set_block_group_flags(&cache->item, type);
5918
5919 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5920 &cache->space_info);
5921 BUG_ON(ret);
5922 down_write(&cache->space_info->groups_sem);
5923 list_add_tail(&cache->list, &cache->space_info->block_groups);
5924 up_write(&cache->space_info->groups_sem);
5925
5926 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5927 BUG_ON(ret);
5928
5929 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5930 sizeof(cache->item));
5931 BUG_ON(ret);
5932
5933 finish_current_insert(trans, extent_root, 0);
5934 ret = del_pending_extents(trans, extent_root, 0);
5935 BUG_ON(ret);
5936 set_avail_alloc_bits(extent_root->fs_info, type);
5937
5938 return 0;
5939}
5940
5941int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5942 struct btrfs_root *root, u64 group_start)
5943{
5944 struct btrfs_path *path;
5945 struct btrfs_block_group_cache *block_group;
5946 struct btrfs_key key;
5947 int ret;
5948
5949 root = root->fs_info->extent_root;
5950
5951 block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5952 BUG_ON(!block_group);
5953 BUG_ON(!block_group->ro);
5954
5955 memcpy(&key, &block_group->key, sizeof(key));
5956
5957 path = btrfs_alloc_path();
5958 BUG_ON(!path);
5959
5960 btrfs_remove_free_space_cache(block_group);
5961 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree);
5963 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem);
5966
5967 spin_lock(&block_group->space_info->lock);
5968 block_group->space_info->total_bytes -= block_group->key.offset;
5969 block_group->space_info->bytes_readonly -= block_group->key.offset;
5970 spin_unlock(&block_group->space_info->lock);
5971 block_group->space_info->full = 0;
5972
5973 put_block_group(block_group);
5974 put_block_group(block_group);
5975
5976 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5977 if (ret > 0)
5978 ret = -EIO;
5979 if (ret < 0)
5980 goto out;
5981
5982 ret = btrfs_del_item(trans, root, path);
5983out:
5984 btrfs_free_path(path);
5985 return ret;
5986}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..e086d407f1fa
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3717 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17#include "compat.h"
18#include "ctree.h"
19#include "btrfs_inode.h"
20
21/* temporary define until extent_map moves out of btrfs */
22struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23 unsigned long extra_flags,
24 void (*ctor)(void *, struct kmem_cache *,
25 unsigned long));
26
27static struct kmem_cache *extent_state_cache;
28static struct kmem_cache *extent_buffer_cache;
29
30static LIST_HEAD(buffers);
31static LIST_HEAD(states);
32
/*
 * NOTE(review): LEAK_DEBUG is defined (with the value 0) and every guard in
 * this file tests it with #ifdef, so the leak tracking code is compiled in
 * regardless of the value.  Changing the value alone does not disable it;
 * all the guards would have to become "#if LEAK_DEBUG" together.
 *
 * leak_lock serialises updates of the 'states' leak list in
 * alloc_extent_state() and free_extent_state().
 */
33#define LEAK_DEBUG 0
34#ifdef LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock);
36#endif
37
38#define BUFFER_LRU_MAX 64
39
/*
 * Generic [start, end] range node used by tree_insert() and
 * __etree_search() when walking an rb-tree.
 *
 * NOTE(review): those helpers apply rb_entry(..., struct tree_entry,
 * rb_node) to nodes that insert_state() and split_state() embed in
 * struct extent_state, so this presumably relies on extent_state placing
 * start, end and rb_node at matching offsets -- confirm in extent_io.h.
 */
40struct tree_entry {
41 u64 start;
42 u64 end;
43 struct rb_node rb_node;
44};
45
/*
 * Bundles a bio, an extent_io_tree, a get_extent callback and the
 * extent_locked flag into one object.  The code that uses it is outside
 * this part of the file; presumably it is the per-call context handed to
 * the writepage helpers -- confirm against the users.
 */
46struct extent_page_data {
47 struct bio *bio;
48 struct extent_io_tree *tree;
49 get_extent_t *get_extent;
50
51 /* tells writepage not to lock the state bits for this range
52 * it still does the unlocking
53 */
54 int extent_locked;
55};
56
57int __init extent_io_init(void)
58{
59 extent_state_cache = btrfs_cache_create("extent_state",
60 sizeof(struct extent_state), 0,
61 NULL);
62 if (!extent_state_cache)
63 return -ENOMEM;
64
65 extent_buffer_cache = btrfs_cache_create("extent_buffers",
66 sizeof(struct extent_buffer), 0,
67 NULL);
68 if (!extent_buffer_cache)
69 goto free_state_cache;
70 return 0;
71
72free_state_cache:
73 kmem_cache_destroy(extent_state_cache);
74 return -ENOMEM;
75}
76
77void extent_io_exit(void)
78{
79 struct extent_state *state;
80 struct extent_buffer *eb;
81
82 while (!list_empty(&states)) {
83 state = list_entry(states.next, struct extent_state, leak_list);
84 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
85 "state %lu in tree %p refs %d\n",
86 (unsigned long long)state->start,
87 (unsigned long long)state->end,
88 state->state, state->tree, atomic_read(&state->refs));
89 list_del(&state->leak_list);
90 kmem_cache_free(extent_state_cache, state);
91
92 }
93
94 while (!list_empty(&buffers)) {
95 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
96 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
97 "refs %d\n", (unsigned long long)eb->start,
98 eb->len, atomic_read(&eb->refs));
99 list_del(&eb->leak_list);
100 kmem_cache_free(extent_buffer_cache, eb);
101 }
102 if (extent_state_cache)
103 kmem_cache_destroy(extent_state_cache);
104 if (extent_buffer_cache)
105 kmem_cache_destroy(extent_buffer_cache);
106}
107
108void extent_io_tree_init(struct extent_io_tree *tree,
109 struct address_space *mapping, gfp_t mask)
110{
111 tree->state.rb_node = NULL;
112 tree->buffer.rb_node = NULL;
113 tree->ops = NULL;
114 tree->dirty_bytes = 0;
115 spin_lock_init(&tree->lock);
116 spin_lock_init(&tree->buffer_lock);
117 tree->mapping = mapping;
118}
119
120static struct extent_state *alloc_extent_state(gfp_t mask)
121{
122 struct extent_state *state;
123#ifdef LEAK_DEBUG
124 unsigned long flags;
125#endif
126
127 state = kmem_cache_alloc(extent_state_cache, mask);
128 if (!state)
129 return state;
130 state->state = 0;
131 state->private = 0;
132 state->tree = NULL;
133#ifdef LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags);
137#endif
138 atomic_set(&state->refs, 1);
139 init_waitqueue_head(&state->wq);
140 return state;
141}
142
143static void free_extent_state(struct extent_state *state)
144{
145 if (!state)
146 return;
147 if (atomic_dec_and_test(&state->refs)) {
148#ifdef LEAK_DEBUG
149 unsigned long flags;
150#endif
151 WARN_ON(state->tree);
152#ifdef LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags);
156#endif
157 kmem_cache_free(extent_state_cache, state);
158 }
159}
160
161static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
162 struct rb_node *node)
163{
164 struct rb_node **p = &root->rb_node;
165 struct rb_node *parent = NULL;
166 struct tree_entry *entry;
167
168 while (*p) {
169 parent = *p;
170 entry = rb_entry(parent, struct tree_entry, rb_node);
171
172 if (offset < entry->start)
173 p = &(*p)->rb_left;
174 else if (offset > entry->end)
175 p = &(*p)->rb_right;
176 else
177 return parent;
178 }
179
180 entry = rb_entry(node, struct tree_entry, rb_node);
181 rb_link_node(node, parent, p);
182 rb_insert_color(node, root);
183 return NULL;
184}
185
186static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
187 struct rb_node **prev_ret,
188 struct rb_node **next_ret)
189{
190 struct rb_root *root = &tree->state;
191 struct rb_node *n = root->rb_node;
192 struct rb_node *prev = NULL;
193 struct rb_node *orig_prev = NULL;
194 struct tree_entry *entry;
195 struct tree_entry *prev_entry = NULL;
196
197 while (n) {
198 entry = rb_entry(n, struct tree_entry, rb_node);
199 prev = n;
200 prev_entry = entry;
201
202 if (offset < entry->start)
203 n = n->rb_left;
204 else if (offset > entry->end)
205 n = n->rb_right;
206 else
207 return n;
208 }
209
210 if (prev_ret) {
211 orig_prev = prev;
212 while (prev && offset > prev_entry->end) {
213 prev = rb_next(prev);
214 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
215 }
216 *prev_ret = prev;
217 prev = orig_prev;
218 }
219
220 if (next_ret) {
221 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222 while (prev && offset < prev_entry->start) {
223 prev = rb_prev(prev);
224 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
225 }
226 *next_ret = prev;
227 }
228 return NULL;
229}
230
231static inline struct rb_node *tree_search(struct extent_io_tree *tree,
232 u64 offset)
233{
234 struct rb_node *prev = NULL;
235 struct rb_node *ret;
236
237 ret = __etree_search(tree, offset, &prev, NULL);
238 if (!ret)
239 return prev;
240 return ret;
241}
242
243static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
244 u64 offset, struct rb_node *node)
245{
246 struct rb_root *root = &tree->buffer;
247 struct rb_node **p = &root->rb_node;
248 struct rb_node *parent = NULL;
249 struct extent_buffer *eb;
250
251 while (*p) {
252 parent = *p;
253 eb = rb_entry(parent, struct extent_buffer, rb_node);
254
255 if (offset < eb->start)
256 p = &(*p)->rb_left;
257 else if (offset > eb->start)
258 p = &(*p)->rb_right;
259 else
260 return eb;
261 }
262
263 rb_link_node(node, parent, p);
264 rb_insert_color(node, root);
265 return NULL;
266}
267
268static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
269 u64 offset)
270{
271 struct rb_root *root = &tree->buffer;
272 struct rb_node *n = root->rb_node;
273 struct extent_buffer *eb;
274
275 while (n) {
276 eb = rb_entry(n, struct extent_buffer, rb_node);
277 if (offset < eb->start)
278 n = n->rb_left;
279 else if (offset > eb->start)
280 n = n->rb_right;
281 else
282 return eb;
283 }
284 return NULL;
285}
286
287/*
288 * utility function to look for merge candidates inside a given range.
289 * Any extents with matching state are merged together into a single
290 * extent in the tree. Extents with EXTENT_IO in their state field
291 * are not merged because the end_io handlers need to be able to do
292 * operations on them without sleeping (or doing allocations/splits).
293 *
294 * This should be called with the tree lock held.
295 */
296static int merge_state(struct extent_io_tree *tree,
297 struct extent_state *state)
298{
299 struct extent_state *other;
300 struct rb_node *other_node;
301
302 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
303 return 0;
304
305 other_node = rb_prev(&state->rb_node);
306 if (other_node) {
307 other = rb_entry(other_node, struct extent_state, rb_node);
308 if (other->end == state->start - 1 &&
309 other->state == state->state) {
310 state->start = other->start;
311 other->tree = NULL;
312 rb_erase(&other->rb_node, &tree->state);
313 free_extent_state(other);
314 }
315 }
316 other_node = rb_next(&state->rb_node);
317 if (other_node) {
318 other = rb_entry(other_node, struct extent_state, rb_node);
319 if (other->start == state->end + 1 &&
320 other->state == state->state) {
321 other->start = state->start;
322 state->tree = NULL;
323 rb_erase(&state->rb_node, &tree->state);
324 free_extent_state(state);
325 }
326 }
327 return 0;
328}
329
330static void set_state_cb(struct extent_io_tree *tree,
331 struct extent_state *state,
332 unsigned long bits)
333{
334 if (tree->ops && tree->ops->set_bit_hook) {
335 tree->ops->set_bit_hook(tree->mapping->host, state->start,
336 state->end, state->state, bits);
337 }
338}
339
340static void clear_state_cb(struct extent_io_tree *tree,
341 struct extent_state *state,
342 unsigned long bits)
343{
344 if (tree->ops && tree->ops->clear_bit_hook) {
345 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
346 state->end, state->state, bits);
347 }
348}
349
350/*
351 * insert an extent_state struct into the tree. 'bits' are set on the
352 * struct before it is inserted.
353 *
354 * This may return -EEXIST if the extent is already there, in which case the
355 * state struct is freed.
356 *
357 * The tree lock is not taken internally. This is a utility function and
358 * probably isn't what you want to call (see set/clear_extent_bit).
359 */
360static int insert_state(struct extent_io_tree *tree,
361 struct extent_state *state, u64 start, u64 end,
362 int bits)
363{
364 struct rb_node *node;
365
366 if (end < start) {
367 printk(KERN_ERR "btrfs end < start %llu %llu\n",
368 (unsigned long long)end,
369 (unsigned long long)start);
370 WARN_ON(1);
371 }
372 if (bits & EXTENT_DIRTY)
373 tree->dirty_bytes += end - start + 1;
374 set_state_cb(tree, state, bits);
375 state->state |= bits;
376 state->start = start;
377 state->end = end;
378 node = tree_insert(&tree->state, end, &state->rb_node);
379 if (node) {
380 struct extent_state *found;
381 found = rb_entry(node, struct extent_state, rb_node);
382 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
383 "%llu %llu\n", (unsigned long long)found->start,
384 (unsigned long long)found->end,
385 (unsigned long long)start, (unsigned long long)end);
386 free_extent_state(state);
387 return -EEXIST;
388 }
389 state->tree = tree;
390 merge_state(tree, state);
391 return 0;
392}
393
394/*
395 * split a given extent state struct in two, inserting the preallocated
396 * struct 'prealloc' as the newly created second half. 'split' indicates an
397 * offset inside 'orig' where it should be split.
398 *
399 * Before calling,
400 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
401 * are two extent state structs in the tree:
402 * prealloc: [orig->start, split - 1]
403 * orig: [ split, orig->end ]
404 *
405 * The tree locks are not taken by this function. They need to be held
406 * by the caller.
407 */
408static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
409 struct extent_state *prealloc, u64 split)
410{
411 struct rb_node *node;
412 prealloc->start = orig->start;
413 prealloc->end = split - 1;
414 prealloc->state = orig->state;
415 orig->start = split;
416
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc);
422 return -EEXIST;
423 }
424 prealloc->tree = tree;
425 return 0;
426}
427
428/*
429 * utility function to clear some bits in an extent state struct.
430 * it will optionally wake up any one waiting on this state (wake == 1), or
431 * forcibly remove the state from the tree (delete == 1).
432 *
433 * If no bits are set on the state struct after clearing things, the
434 * struct is freed and removed from the tree
435 */
436static int clear_state_bit(struct extent_io_tree *tree,
437 struct extent_state *state, int bits, int wake,
438 int delete)
439{
440 int ret = state->state & bits;
441
442 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
443 u64 range = state->end - state->start + 1;
444 WARN_ON(range > tree->dirty_bytes);
445 tree->dirty_bytes -= range;
446 }
447 clear_state_cb(tree, state, bits);
448 state->state &= ~bits;
449 if (wake)
450 wake_up(&state->wq);
451 if (delete || state->state == 0) {
452 if (state->tree) {
453 clear_state_cb(tree, state, state->state);
454 rb_erase(&state->rb_node, &tree->state);
455 state->tree = NULL;
456 free_extent_state(state);
457 } else {
458 WARN_ON(1);
459 }
460 } else {
461 merge_state(tree, state);
462 }
463 return ret;
464}
465
466/*
467 * clear some bits on a range in the tree. This may require splitting
468 * or inserting elements in the tree, so the gfp mask is used to
469 * indicate which allocations or sleeping are allowed.
470 *
471 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
472 * the given range from the tree regardless of state (ie for truncate).
473 *
474 * the range [start, end] is inclusive.
475 *
476 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
477 * bits were already set, or zero if none of the bits were already set.
478 */
479int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
480 int bits, int wake, int delete, gfp_t mask)
481{
482 struct extent_state *state;
483 struct extent_state *prealloc = NULL;
484 struct rb_node *node;
485 int err;
486 int set = 0;
487
488again:
489 if (!prealloc && (mask & __GFP_WAIT)) {
490 prealloc = alloc_extent_state(mask);
491 if (!prealloc)
492 return -ENOMEM;
493 }
494
495 spin_lock(&tree->lock);
496 /*
497 * this search will find the extents that end after
498 * our range starts
499 */
500 node = tree_search(tree, start);
501 if (!node)
502 goto out;
503 state = rb_entry(node, struct extent_state, rb_node);
504 if (state->start > end)
505 goto out;
506 WARN_ON(state->end < start);
507
508 /*
509 * | ---- desired range ---- |
510 * | state | or
511 * | ------------- state -------------- |
512 *
513 * We need to split the extent we found, and may flip
514 * bits on second half.
515 *
516 * If the extent we found extends past our range, we
517 * just split and search again. It'll get split again
518 * the next time though.
519 *
520 * If the extent we found is inside our range, we clear
521 * the desired bit on it.
522 */
523
524 if (state->start < start) {
525 if (!prealloc)
526 prealloc = alloc_extent_state(GFP_ATOMIC);
527 err = split_state(tree, state, prealloc, start);
528 BUG_ON(err == -EEXIST);
529 prealloc = NULL;
530 if (err)
531 goto out;
532 if (state->end <= end) {
533 start = state->end + 1;
534 set |= clear_state_bit(tree, state, bits,
535 wake, delete);
536 } else {
537 start = state->start;
538 }
539 goto search_again;
540 }
541 /*
542 * | ---- desired range ---- |
543 * | state |
544 * We need to split the extent, and clear the bit
545 * on the first half
546 */
547 if (state->start <= end && state->end > end) {
548 if (!prealloc)
549 prealloc = alloc_extent_state(GFP_ATOMIC);
550 err = split_state(tree, state, prealloc, end + 1);
551 BUG_ON(err == -EEXIST);
552
553 if (wake)
554 wake_up(&state->wq);
555 set |= clear_state_bit(tree, prealloc, bits,
556 wake, delete);
557 prealloc = NULL;
558 goto out;
559 }
560
561 start = state->end + 1;
562 set |= clear_state_bit(tree, state, bits, wake, delete);
563 goto search_again;
564
565out:
566 spin_unlock(&tree->lock);
567 if (prealloc)
568 free_extent_state(prealloc);
569
570 return set;
571
572search_again:
573 if (start > end)
574 goto out;
575 spin_unlock(&tree->lock);
576 if (mask & __GFP_WAIT)
577 cond_resched();
578 goto again;
579}
580
581static int wait_on_state(struct extent_io_tree *tree,
582 struct extent_state *state)
583 __releases(tree->lock)
584 __acquires(tree->lock)
585{
586 DEFINE_WAIT(wait);
587 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
588 spin_unlock(&tree->lock);
589 schedule();
590 spin_lock(&tree->lock);
591 finish_wait(&state->wq, &wait);
592 return 0;
593}
594
595/*
596 * waits for one or more bits to clear on a range in the state tree.
597 * The range [start, end] is inclusive.
598 * The tree lock is taken by this function
599 */
600int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
601{
602 struct extent_state *state;
603 struct rb_node *node;
604
605 spin_lock(&tree->lock);
606again:
607 while (1) {
608 /*
609 * this search will find all the extents that end after
610 * our range starts
611 */
612 node = tree_search(tree, start);
613 if (!node)
614 break;
615
616 state = rb_entry(node, struct extent_state, rb_node);
617
618 if (state->start > end)
619 goto out;
620
621 if (state->state & bits) {
622 start = state->start;
623 atomic_inc(&state->refs);
624 wait_on_state(tree, state);
625 free_extent_state(state);
626 goto again;
627 }
628 start = state->end + 1;
629
630 if (start > end)
631 break;
632
633 if (need_resched()) {
634 spin_unlock(&tree->lock);
635 cond_resched();
636 spin_lock(&tree->lock);
637 }
638 }
639out:
640 spin_unlock(&tree->lock);
641 return 0;
642}
643
644static void set_state_bits(struct extent_io_tree *tree,
645 struct extent_state *state,
646 int bits)
647{
648 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
649 u64 range = state->end - state->start + 1;
650 tree->dirty_bytes += range;
651 }
652 set_state_cb(tree, state, bits);
653 state->state |= bits;
654}
655
656/*
657 * set some bits on a range in the tree. This may require allocations
658 * or sleeping, so the gfp mask is used to indicate what is allowed.
659 *
660 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
661 * range already has the desired bits set. The start of the existing
662 * range is returned in failed_start in this case.
663 *
664 * [start, end] is inclusive
665 * This takes the tree lock.
666 */
667static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
668 int bits, int exclusive, u64 *failed_start,
669 gfp_t mask)
670{
671 struct extent_state *state;
672 struct extent_state *prealloc = NULL;
673 struct rb_node *node;
674 int err = 0;
675 int set;
676 u64 last_start;
677 u64 last_end;
678again:
679 if (!prealloc && (mask & __GFP_WAIT)) {
680 prealloc = alloc_extent_state(mask);
681 if (!prealloc)
682 return -ENOMEM;
683 }
684
685 spin_lock(&tree->lock);
686 /*
687 * this search will find all the extents that end after
688 * our range starts.
689 */
690 node = tree_search(tree, start);
691 if (!node) {
692 err = insert_state(tree, prealloc, start, end, bits);
693 prealloc = NULL;
694 BUG_ON(err == -EEXIST);
695 goto out;
696 }
697
698 state = rb_entry(node, struct extent_state, rb_node);
699 last_start = state->start;
700 last_end = state->end;
701
702 /*
703 * | ---- desired range ---- |
704 * | state |
705 *
706 * Just lock what we found and keep going
707 */
708 if (state->start == start && state->end <= end) {
709 set = state->state & bits;
710 if (set && exclusive) {
711 *failed_start = state->start;
712 err = -EEXIST;
713 goto out;
714 }
715 set_state_bits(tree, state, bits);
716 start = state->end + 1;
717 merge_state(tree, state);
718 goto search_again;
719 }
720
721 /*
722 * | ---- desired range ---- |
723 * | state |
724 * or
725 * | ------------- state -------------- |
726 *
727 * We need to split the extent we found, and may flip bits on
728 * second half.
729 *
730 * If the extent we found extends past our
731 * range, we just split and search again. It'll get split
732 * again the next time though.
733 *
734 * If the extent we found is inside our range, we set the
735 * desired bit on it.
736 */
737 if (state->start < start) {
738 set = state->state & bits;
739 if (exclusive && set) {
740 *failed_start = start;
741 err = -EEXIST;
742 goto out;
743 }
744 err = split_state(tree, state, prealloc, start);
745 BUG_ON(err == -EEXIST);
746 prealloc = NULL;
747 if (err)
748 goto out;
749 if (state->end <= end) {
750 set_state_bits(tree, state, bits);
751 start = state->end + 1;
752 merge_state(tree, state);
753 } else {
754 start = state->start;
755 }
756 goto search_again;
757 }
758 /*
759 * | ---- desired range ---- |
760 * | state | or | state |
761 *
762 * There's a hole, we need to insert something in it and
763 * ignore the extent we found.
764 */
765 if (state->start > start) {
766 u64 this_end;
767 if (end < last_start)
768 this_end = end;
769 else
770 this_end = last_start - 1;
771 err = insert_state(tree, prealloc, start, this_end,
772 bits);
773 prealloc = NULL;
774 BUG_ON(err == -EEXIST);
775 if (err)
776 goto out;
777 start = this_end + 1;
778 goto search_again;
779 }
780 /*
781 * | ---- desired range ---- |
782 * | state |
783 * We need to split the extent, and set the bit
784 * on the first half
785 */
786 if (state->start <= end && state->end > end) {
787 set = state->state & bits;
788 if (exclusive && set) {
789 *failed_start = start;
790 err = -EEXIST;
791 goto out;
792 }
793 err = split_state(tree, state, prealloc, end + 1);
794 BUG_ON(err == -EEXIST);
795
796 set_state_bits(tree, prealloc, bits);
797 merge_state(tree, prealloc);
798 prealloc = NULL;
799 goto out;
800 }
801
802 goto search_again;
803
804out:
805 spin_unlock(&tree->lock);
806 if (prealloc)
807 free_extent_state(prealloc);
808
809 return err;
810
811search_again:
812 if (start > end)
813 goto out;
814 spin_unlock(&tree->lock);
815 if (mask & __GFP_WAIT)
816 cond_resched();
817 goto again;
818}
819
820/* wrappers around set/clear extent bit */
821int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
822 gfp_t mask)
823{
824 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
825 mask);
826}
827
828int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
829 gfp_t mask)
830{
831 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
832}
833
834int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
835 int bits, gfp_t mask)
836{
837 return set_extent_bit(tree, start, end, bits, 0, NULL,
838 mask);
839}
840
841int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
842 int bits, gfp_t mask)
843{
844 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
845}
846
847int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
848 gfp_t mask)
849{
850 return set_extent_bit(tree, start, end,
851 EXTENT_DELALLOC | EXTENT_DIRTY,
852 0, NULL, mask);
853}
854
855int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
856 gfp_t mask)
857{
858 return clear_extent_bit(tree, start, end,
859 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
860}
861
862int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
863 gfp_t mask)
864{
865 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
866}
867
868int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
869 gfp_t mask)
870{
871 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
872 mask);
873}
874
875static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
876 gfp_t mask)
877{
878 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
879}
880
881int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
882 gfp_t mask)
883{
884 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
885 mask);
886}
887
888static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
889 u64 end, gfp_t mask)
890{
891 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
892}
893
894static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
895 gfp_t mask)
896{
897 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
898 0, NULL, mask);
899}
900
901static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
902 u64 end, gfp_t mask)
903{
904 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
905}
906
907int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
908{
909 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
910}
911
912/*
913 * either insert or lock state struct between start and end use mask to tell
914 * us if waiting is desired.
915 */
916int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
917{
918 int err;
919 u64 failed_start;
920 while (1) {
921 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
922 &failed_start, mask);
923 if (err == -EEXIST && (mask & __GFP_WAIT)) {
924 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
925 start = failed_start;
926 } else {
927 break;
928 }
929 WARN_ON(start > end);
930 }
931 return err;
932}
933
934int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
935 gfp_t mask)
936{
937 int err;
938 u64 failed_start;
939
940 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
941 &failed_start, mask);
942 if (err == -EEXIST) {
943 if (failed_start > start)
944 clear_extent_bit(tree, start, failed_start - 1,
945 EXTENT_LOCKED, 1, 0, mask);
946 return 0;
947 }
948 return 1;
949}
950
951int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
952 gfp_t mask)
953{
954 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
955}
956
957/*
958 * helper function to set pages and extents in the tree dirty
959 */
960int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
961{
962 unsigned long index = start >> PAGE_CACHE_SHIFT;
963 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
964 struct page *page;
965
966 while (index <= end_index) {
967 page = find_get_page(tree->mapping, index);
968 BUG_ON(!page);
969 __set_page_dirty_nobuffers(page);
970 page_cache_release(page);
971 index++;
972 }
973 set_extent_dirty(tree, start, end, GFP_NOFS);
974 return 0;
975}
976
977/*
978 * helper function to set both pages and extents in the tree writeback
979 */
980static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
981{
982 unsigned long index = start >> PAGE_CACHE_SHIFT;
983 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
984 struct page *page;
985
986 while (index <= end_index) {
987 page = find_get_page(tree->mapping, index);
988 BUG_ON(!page);
989 set_page_writeback(page);
990 page_cache_release(page);
991 index++;
992 }
993 set_extent_writeback(tree, start, end, GFP_NOFS);
994 return 0;
995}
996
997/*
998 * find the first offset in the io tree with 'bits' set. zero is
999 * returned if we find something, and *start_ret and *end_ret are
1000 * set to reflect the state struct that was found.
1001 *
1002 * If nothing was found, 1 is returned, < 0 on error
1003 */
1004int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1005 u64 *start_ret, u64 *end_ret, int bits)
1006{
1007 struct rb_node *node;
1008 struct extent_state *state;
1009 int ret = 1;
1010
1011 spin_lock(&tree->lock);
1012 /*
1013 * this search will find all the extents that end after
1014 * our range starts.
1015 */
1016 node = tree_search(tree, start);
1017 if (!node)
1018 goto out;
1019
1020 while (1) {
1021 state = rb_entry(node, struct extent_state, rb_node);
1022 if (state->end >= start && (state->state & bits)) {
1023 *start_ret = state->start;
1024 *end_ret = state->end;
1025 ret = 0;
1026 break;
1027 }
1028 node = rb_next(node);
1029 if (!node)
1030 break;
1031 }
1032out:
1033 spin_unlock(&tree->lock);
1034 return ret;
1035}
1036
1037/* find the first state struct with 'bits' set after 'start', and
1038 * return it. tree->lock must be held. NULL will returned if
1039 * nothing was found after 'start'
1040 */
1041struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1042 u64 start, int bits)
1043{
1044 struct rb_node *node;
1045 struct extent_state *state;
1046
1047 /*
1048 * this search will find all the extents that end after
1049 * our range starts.
1050 */
1051 node = tree_search(tree, start);
1052 if (!node)
1053 goto out;
1054
1055 while (1) {
1056 state = rb_entry(node, struct extent_state, rb_node);
1057 if (state->end >= start && (state->state & bits))
1058 return state;
1059
1060 node = rb_next(node);
1061 if (!node)
1062 break;
1063 }
1064out:
1065 return NULL;
1066}
1067
1068/*
1069 * find a contiguous range of bytes in the file marked as delalloc, not
1070 * more than 'max_bytes'. start and end are used to return the range,
1071 *
1072 * 1 is returned if we find something, 0 if nothing was in the tree
1073 */
1074static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1075 u64 *start, u64 *end, u64 max_bytes)
1076{
1077 struct rb_node *node;
1078 struct extent_state *state;
1079 u64 cur_start = *start;
1080 u64 found = 0;
1081 u64 total_bytes = 0;
1082
1083 spin_lock(&tree->lock);
1084
1085 /*
1086 * this search will find all the extents that end after
1087 * our range starts.
1088 */
1089 node = tree_search(tree, cur_start);
1090 if (!node) {
1091 if (!found)
1092 *end = (u64)-1;
1093 goto out;
1094 }
1095
1096 while (1) {
1097 state = rb_entry(node, struct extent_state, rb_node);
1098 if (found && (state->start != cur_start ||
1099 (state->state & EXTENT_BOUNDARY))) {
1100 goto out;
1101 }
1102 if (!(state->state & EXTENT_DELALLOC)) {
1103 if (!found)
1104 *end = state->end;
1105 goto out;
1106 }
1107 if (!found)
1108 *start = state->start;
1109 found++;
1110 *end = state->end;
1111 cur_start = state->end + 1;
1112 node = rb_next(node);
1113 if (!node)
1114 break;
1115 total_bytes += state->end - state->start + 1;
1116 if (total_bytes >= max_bytes)
1117 break;
1118 }
1119out:
1120 spin_unlock(&tree->lock);
1121 return found;
1122}
1123
1124static noinline int __unlock_for_delalloc(struct inode *inode,
1125 struct page *locked_page,
1126 u64 start, u64 end)
1127{
1128 int ret;
1129 struct page *pages[16];
1130 unsigned long index = start >> PAGE_CACHE_SHIFT;
1131 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1132 unsigned long nr_pages = end_index - index + 1;
1133 int i;
1134
1135 if (index == locked_page->index && end_index == index)
1136 return 0;
1137
1138 while (nr_pages > 0) {
1139 ret = find_get_pages_contig(inode->i_mapping, index,
1140 min_t(unsigned long, nr_pages,
1141 ARRAY_SIZE(pages)), pages);
1142 for (i = 0; i < ret; i++) {
1143 if (pages[i] != locked_page)
1144 unlock_page(pages[i]);
1145 page_cache_release(pages[i]);
1146 }
1147 nr_pages -= ret;
1148 index += ret;
1149 cond_resched();
1150 }
1151 return 0;
1152}
1153
1154static noinline int lock_delalloc_pages(struct inode *inode,
1155 struct page *locked_page,
1156 u64 delalloc_start,
1157 u64 delalloc_end)
1158{
1159 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1160 unsigned long start_index = index;
1161 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1162 unsigned long pages_locked = 0;
1163 struct page *pages[16];
1164 unsigned long nrpages;
1165 int ret;
1166 int i;
1167
1168 /* the caller is responsible for locking the start index */
1169 if (index == locked_page->index && index == end_index)
1170 return 0;
1171
1172 /* skip the page at the start index */
1173 nrpages = end_index - index + 1;
1174 while (nrpages > 0) {
1175 ret = find_get_pages_contig(inode->i_mapping, index,
1176 min_t(unsigned long,
1177 nrpages, ARRAY_SIZE(pages)), pages);
1178 if (ret == 0) {
1179 ret = -EAGAIN;
1180 goto done;
1181 }
1182 /* now we have an array of pages, lock them all */
1183 for (i = 0; i < ret; i++) {
1184 /*
1185 * the caller is taking responsibility for
1186 * locked_page
1187 */
1188 if (pages[i] != locked_page) {
1189 lock_page(pages[i]);
1190 if (!PageDirty(pages[i]) ||
1191 pages[i]->mapping != inode->i_mapping) {
1192 ret = -EAGAIN;
1193 unlock_page(pages[i]);
1194 page_cache_release(pages[i]);
1195 goto done;
1196 }
1197 }
1198 page_cache_release(pages[i]);
1199 pages_locked++;
1200 }
1201 nrpages -= ret;
1202 index += ret;
1203 cond_resched();
1204 }
1205 ret = 0;
1206done:
1207 if (ret && pages_locked) {
1208 __unlock_for_delalloc(inode, locked_page,
1209 delalloc_start,
1210 ((u64)(start_index + pages_locked - 1)) <<
1211 PAGE_CACHE_SHIFT);
1212 }
1213 return ret;
1214}
1215
1216/*
1217 * find a contiguous range of bytes in the file marked as delalloc, not
1218 * more than 'max_bytes'. start and end are used to return the range,
1219 *
1220 * 1 is returned if we find something, 0 if nothing was in the tree
1221 */
1222static noinline u64 find_lock_delalloc_range(struct inode *inode,
1223 struct extent_io_tree *tree,
1224 struct page *locked_page,
1225 u64 *start, u64 *end,
1226 u64 max_bytes)
1227{
1228 u64 delalloc_start;
1229 u64 delalloc_end;
1230 u64 found;
1231 int ret;
1232 int loops = 0;
1233
1234again:
1235 /* step one, find a bunch of delalloc bytes starting at start */
1236 delalloc_start = *start;
1237 delalloc_end = 0;
1238 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1239 max_bytes);
1240 if (!found || delalloc_end <= *start) {
1241 *start = delalloc_start;
1242 *end = delalloc_end;
1243 return found;
1244 }
1245
1246 /*
1247 * start comes from the offset of locked_page. We have to lock
1248 * pages in order, so we can't process delalloc bytes before
1249 * locked_page
1250 */
1251 if (delalloc_start < *start)
1252 delalloc_start = *start;
1253
1254 /*
1255 * make sure to limit the number of pages we try to lock down
1256 * if we're looping.
1257 */
1258 if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1259 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1260
1261 /* step two, lock all the pages after the page that has start */
1262 ret = lock_delalloc_pages(inode, locked_page,
1263 delalloc_start, delalloc_end);
1264 if (ret == -EAGAIN) {
1265 /* some of the pages are gone, lets avoid looping by
1266 * shortening the size of the delalloc range we're searching
1267 */
1268 if (!loops) {
1269 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1270 max_bytes = PAGE_CACHE_SIZE - offset;
1271 loops = 1;
1272 goto again;
1273 } else {
1274 found = 0;
1275 goto out_failed;
1276 }
1277 }
1278 BUG_ON(ret);
1279
1280 /* step three, lock the state bits for the whole range */
1281 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1282
1283 /* then test to make sure it is all still delalloc */
1284 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1285 EXTENT_DELALLOC, 1);
1286 if (!ret) {
1287 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1288 __unlock_for_delalloc(inode, locked_page,
1289 delalloc_start, delalloc_end);
1290 cond_resched();
1291 goto again;
1292 }
1293 *start = delalloc_start;
1294 *end = delalloc_end;
1295out_failed:
1296 return found;
1297}
1298
1299int extent_clear_unlock_delalloc(struct inode *inode,
1300 struct extent_io_tree *tree,
1301 u64 start, u64 end, struct page *locked_page,
1302 int unlock_pages,
1303 int clear_unlock,
1304 int clear_delalloc, int clear_dirty,
1305 int set_writeback,
1306 int end_writeback)
1307{
1308 int ret;
1309 struct page *pages[16];
1310 unsigned long index = start >> PAGE_CACHE_SHIFT;
1311 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1312 unsigned long nr_pages = end_index - index + 1;
1313 int i;
1314 int clear_bits = 0;
1315
1316 if (clear_unlock)
1317 clear_bits |= EXTENT_LOCKED;
1318 if (clear_dirty)
1319 clear_bits |= EXTENT_DIRTY;
1320
1321 if (clear_delalloc)
1322 clear_bits |= EXTENT_DELALLOC;
1323
1324 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1325 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1326 return 0;
1327
1328 while (nr_pages > 0) {
1329 ret = find_get_pages_contig(inode->i_mapping, index,
1330 min_t(unsigned long,
1331 nr_pages, ARRAY_SIZE(pages)), pages);
1332 for (i = 0; i < ret; i++) {
1333 if (pages[i] == locked_page) {
1334 page_cache_release(pages[i]);
1335 continue;
1336 }
1337 if (clear_dirty)
1338 clear_page_dirty_for_io(pages[i]);
1339 if (set_writeback)
1340 set_page_writeback(pages[i]);
1341 if (end_writeback)
1342 end_page_writeback(pages[i]);
1343 if (unlock_pages)
1344 unlock_page(pages[i]);
1345 page_cache_release(pages[i]);
1346 }
1347 nr_pages -= ret;
1348 index += ret;
1349 cond_resched();
1350 }
1351 return 0;
1352}
1353
1354/*
1355 * count the number of bytes in the tree that have a given bit(s)
1356 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1357 * cached. The total number found is returned.
1358 */
1359u64 count_range_bits(struct extent_io_tree *tree,
1360 u64 *start, u64 search_end, u64 max_bytes,
1361 unsigned long bits)
1362{
1363 struct rb_node *node;
1364 struct extent_state *state;
1365 u64 cur_start = *start;
1366 u64 total_bytes = 0;
1367 int found = 0;
1368
1369 if (search_end <= cur_start) {
1370 WARN_ON(1);
1371 return 0;
1372 }
1373
1374 spin_lock(&tree->lock);
1375 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1376 total_bytes = tree->dirty_bytes;
1377 goto out;
1378 }
1379 /*
1380 * this search will find all the extents that end after
1381 * our range starts.
1382 */
1383 node = tree_search(tree, cur_start);
1384 if (!node)
1385 goto out;
1386
1387 while (1) {
1388 state = rb_entry(node, struct extent_state, rb_node);
1389 if (state->start > search_end)
1390 break;
1391 if (state->end >= cur_start && (state->state & bits)) {
1392 total_bytes += min(search_end, state->end) + 1 -
1393 max(cur_start, state->start);
1394 if (total_bytes >= max_bytes)
1395 break;
1396 if (!found) {
1397 *start = state->start;
1398 found = 1;
1399 }
1400 }
1401 node = rb_next(node);
1402 if (!node)
1403 break;
1404 }
1405out:
1406 spin_unlock(&tree->lock);
1407 return total_bytes;
1408}
1409
1410#if 0
1411/*
1412 * helper function to lock both pages and extents in the tree.
1413 * pages must be locked first.
1414 */
1415static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1416{
1417 unsigned long index = start >> PAGE_CACHE_SHIFT;
1418 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1419 struct page *page;
1420 int err;
1421
1422 while (index <= end_index) {
1423 page = grab_cache_page(tree->mapping, index);
1424 if (!page) {
1425 err = -ENOMEM;
1426 goto failed;
1427 }
1428 if (IS_ERR(page)) {
1429 err = PTR_ERR(page);
1430 goto failed;
1431 }
1432 index++;
1433 }
1434 lock_extent(tree, start, end, GFP_NOFS);
1435 return 0;
1436
1437failed:
1438 /*
1439 * we failed above in getting the page at 'index', so we undo here
1440 * up to but not including the page at 'index'
1441 */
1442 end_index = index;
1443 index = start >> PAGE_CACHE_SHIFT;
1444 while (index < end_index) {
1445 page = find_get_page(tree->mapping, index);
1446 unlock_page(page);
1447 page_cache_release(page);
1448 index++;
1449 }
1450 return err;
1451}
1452
1453/*
1454 * helper function to unlock both pages and extents in the tree.
1455 */
1456static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1457{
1458 unsigned long index = start >> PAGE_CACHE_SHIFT;
1459 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1460 struct page *page;
1461
1462 while (index <= end_index) {
1463 page = find_get_page(tree->mapping, index);
1464 unlock_page(page);
1465 page_cache_release(page);
1466 index++;
1467 }
1468 unlock_extent(tree, start, end, GFP_NOFS);
1469 return 0;
1470}
1471#endif
1472
1473/*
1474 * set the private field for a given byte offset in the tree. If there isn't
1475 * an extent_state there already, this does nothing.
1476 */
1477int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1478{
1479 struct rb_node *node;
1480 struct extent_state *state;
1481 int ret = 0;
1482
1483 spin_lock(&tree->lock);
1484 /*
1485 * this search will find all the extents that end after
1486 * our range starts.
1487 */
1488 node = tree_search(tree, start);
1489 if (!node) {
1490 ret = -ENOENT;
1491 goto out;
1492 }
1493 state = rb_entry(node, struct extent_state, rb_node);
1494 if (state->start != start) {
1495 ret = -ENOENT;
1496 goto out;
1497 }
1498 state->private = private;
1499out:
1500 spin_unlock(&tree->lock);
1501 return ret;
1502}
1503
1504int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1505{
1506 struct rb_node *node;
1507 struct extent_state *state;
1508 int ret = 0;
1509
1510 spin_lock(&tree->lock);
1511 /*
1512 * this search will find all the extents that end after
1513 * our range starts.
1514 */
1515 node = tree_search(tree, start);
1516 if (!node) {
1517 ret = -ENOENT;
1518 goto out;
1519 }
1520 state = rb_entry(node, struct extent_state, rb_node);
1521 if (state->start != start) {
1522 ret = -ENOENT;
1523 goto out;
1524 }
1525 *private = state->private;
1526out:
1527 spin_unlock(&tree->lock);
1528 return ret;
1529}
1530
1531/*
1532 * searches a range in the state tree for a given mask.
1533 * If 'filled' == 1, this returns 1 only if every extent in the tree
1534 * has the bits set. Otherwise, 1 is returned if any bit in the
1535 * range is found set.
1536 */
1537int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1538 int bits, int filled)
1539{
1540 struct extent_state *state = NULL;
1541 struct rb_node *node;
1542 int bitset = 0;
1543
1544 spin_lock(&tree->lock);
1545 node = tree_search(tree, start);
1546 while (node && start <= end) {
1547 state = rb_entry(node, struct extent_state, rb_node);
1548
1549 if (filled && state->start > start) {
1550 bitset = 0;
1551 break;
1552 }
1553
1554 if (state->start > end)
1555 break;
1556
1557 if (state->state & bits) {
1558 bitset = 1;
1559 if (!filled)
1560 break;
1561 } else if (filled) {
1562 bitset = 0;
1563 break;
1564 }
1565 start = state->end + 1;
1566 if (start > end)
1567 break;
1568 node = rb_next(node);
1569 if (!node) {
1570 if (filled)
1571 bitset = 0;
1572 break;
1573 }
1574 }
1575 spin_unlock(&tree->lock);
1576 return bitset;
1577}
1578
1579/*
1580 * helper function to set a given page up to date if all the
1581 * extents in the tree for that page are up to date
1582 */
1583static int check_page_uptodate(struct extent_io_tree *tree,
1584 struct page *page)
1585{
1586 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1587 u64 end = start + PAGE_CACHE_SIZE - 1;
1588 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1589 SetPageUptodate(page);
1590 return 0;
1591}
1592
1593/*
1594 * helper function to unlock a page if all the extents in the tree
1595 * for that page are unlocked
1596 */
1597static int check_page_locked(struct extent_io_tree *tree,
1598 struct page *page)
1599{
1600 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1601 u64 end = start + PAGE_CACHE_SIZE - 1;
1602 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1603 unlock_page(page);
1604 return 0;
1605}
1606
1607/*
1608 * helper function to end page writeback if all the extents
1609 * in the tree for that page are done with writeback
1610 */
1611static int check_page_writeback(struct extent_io_tree *tree,
1612 struct page *page)
1613{
1614 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1615 u64 end = start + PAGE_CACHE_SIZE - 1;
1616 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1617 end_page_writeback(page);
1618 return 0;
1619}
1620
1621/* lots and lots of room for performance fixes in the end_bio funcs */
1622
1623/*
1624 * after a writepage IO is done, we need to:
1625 * clear the uptodate bits on error
1626 * clear the writeback bits in the extent tree for this IO
1627 * end_page_writeback if the page has no more pending IO
1628 *
1629 * Scheduling is not allowed, so the extent state tree is expected
1630 * to have one and only one object corresponding to this IO.
1631 */
1632static void end_bio_extent_writepage(struct bio *bio, int err)
1633{
1634 int uptodate = err == 0;
1635 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1636 struct extent_io_tree *tree;
1637 u64 start;
1638 u64 end;
1639 int whole_page;
1640 int ret;
1641
1642 do {
1643 struct page *page = bvec->bv_page;
1644 tree = &BTRFS_I(page->mapping->host)->io_tree;
1645
1646 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1647 bvec->bv_offset;
1648 end = start + bvec->bv_len - 1;
1649
1650 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1651 whole_page = 1;
1652 else
1653 whole_page = 0;
1654
1655 if (--bvec >= bio->bi_io_vec)
1656 prefetchw(&bvec->bv_page->flags);
1657 if (tree->ops && tree->ops->writepage_end_io_hook) {
1658 ret = tree->ops->writepage_end_io_hook(page, start,
1659 end, NULL, uptodate);
1660 if (ret)
1661 uptodate = 0;
1662 }
1663
1664 if (!uptodate && tree->ops &&
1665 tree->ops->writepage_io_failed_hook) {
1666 ret = tree->ops->writepage_io_failed_hook(bio, page,
1667 start, end, NULL);
1668 if (ret == 0) {
1669 uptodate = (err == 0);
1670 continue;
1671 }
1672 }
1673
1674 if (!uptodate) {
1675 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1676 ClearPageUptodate(page);
1677 SetPageError(page);
1678 }
1679
1680 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1681
1682 if (whole_page)
1683 end_page_writeback(page);
1684 else
1685 check_page_writeback(tree, page);
1686 } while (bvec >= bio->bi_io_vec);
1687
1688 bio_put(bio);
1689}
1690
1691/*
1692 * after a readpage IO is done, we need to:
1693 * clear the uptodate bits on error
1694 * set the uptodate bits if things worked
1695 * set the page up to date if all extents in the tree are uptodate
1696 * clear the lock bit in the extent tree
1697 * unlock the page if there are no other extents locked for it
1698 *
1699 * Scheduling is not allowed, so the extent state tree is expected
1700 * to have one and only one object corresponding to this IO.
1701 */
1702static void end_bio_extent_readpage(struct bio *bio, int err)
1703{
1704 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1705 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1706 struct extent_io_tree *tree;
1707 u64 start;
1708 u64 end;
1709 int whole_page;
1710 int ret;
1711
1712 if (err)
1713 uptodate = 0;
1714
1715 do {
1716 struct page *page = bvec->bv_page;
1717 tree = &BTRFS_I(page->mapping->host)->io_tree;
1718
1719 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1720 bvec->bv_offset;
1721 end = start + bvec->bv_len - 1;
1722
1723 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1724 whole_page = 1;
1725 else
1726 whole_page = 0;
1727
1728 if (--bvec >= bio->bi_io_vec)
1729 prefetchw(&bvec->bv_page->flags);
1730
1731 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1732 ret = tree->ops->readpage_end_io_hook(page, start, end,
1733 NULL);
1734 if (ret)
1735 uptodate = 0;
1736 }
1737 if (!uptodate && tree->ops &&
1738 tree->ops->readpage_io_failed_hook) {
1739 ret = tree->ops->readpage_io_failed_hook(bio, page,
1740 start, end, NULL);
1741 if (ret == 0) {
1742 uptodate =
1743 test_bit(BIO_UPTODATE, &bio->bi_flags);
1744 if (err)
1745 uptodate = 0;
1746 continue;
1747 }
1748 }
1749
1750 if (uptodate) {
1751 set_extent_uptodate(tree, start, end,
1752 GFP_ATOMIC);
1753 }
1754 unlock_extent(tree, start, end, GFP_ATOMIC);
1755
1756 if (whole_page) {
1757 if (uptodate) {
1758 SetPageUptodate(page);
1759 } else {
1760 ClearPageUptodate(page);
1761 SetPageError(page);
1762 }
1763 unlock_page(page);
1764 } else {
1765 if (uptodate) {
1766 check_page_uptodate(tree, page);
1767 } else {
1768 ClearPageUptodate(page);
1769 SetPageError(page);
1770 }
1771 check_page_locked(tree, page);
1772 }
1773 } while (bvec >= bio->bi_io_vec);
1774
1775 bio_put(bio);
1776}
1777
1778/*
1779 * IO done from prepare_write is pretty simple, we just unlock
1780 * the structs in the extent tree when done, and set the uptodate bits
1781 * as appropriate.
1782 */
1783static void end_bio_extent_preparewrite(struct bio *bio, int err)
1784{
1785 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1786 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1787 struct extent_io_tree *tree;
1788 u64 start;
1789 u64 end;
1790
1791 do {
1792 struct page *page = bvec->bv_page;
1793 tree = &BTRFS_I(page->mapping->host)->io_tree;
1794
1795 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1796 bvec->bv_offset;
1797 end = start + bvec->bv_len - 1;
1798
1799 if (--bvec >= bio->bi_io_vec)
1800 prefetchw(&bvec->bv_page->flags);
1801
1802 if (uptodate) {
1803 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1804 } else {
1805 ClearPageUptodate(page);
1806 SetPageError(page);
1807 }
1808
1809 unlock_extent(tree, start, end, GFP_ATOMIC);
1810
1811 } while (bvec >= bio->bi_io_vec);
1812
1813 bio_put(bio);
1814}
1815
1816static struct bio *
1817extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1818 gfp_t gfp_flags)
1819{
1820 struct bio *bio;
1821
1822 bio = bio_alloc(gfp_flags, nr_vecs);
1823
1824 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1825 while (!bio && (nr_vecs /= 2))
1826 bio = bio_alloc(gfp_flags, nr_vecs);
1827 }
1828
1829 if (bio) {
1830 bio->bi_size = 0;
1831 bio->bi_bdev = bdev;
1832 bio->bi_sector = first_sector;
1833 }
1834 return bio;
1835}
1836
1837static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1838 unsigned long bio_flags)
1839{
1840 int ret = 0;
1841 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1842 struct page *page = bvec->bv_page;
1843 struct extent_io_tree *tree = bio->bi_private;
1844 u64 start;
1845 u64 end;
1846
1847 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1848 end = start + bvec->bv_len - 1;
1849
1850 bio->bi_private = NULL;
1851
1852 bio_get(bio);
1853
1854 if (tree->ops && tree->ops->submit_bio_hook)
1855 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1856 mirror_num, bio_flags);
1857 else
1858 submit_bio(rw, bio);
1859 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860 ret = -EOPNOTSUPP;
1861 bio_put(bio);
1862 return ret;
1863}
1864
1865static int submit_extent_page(int rw, struct extent_io_tree *tree,
1866 struct page *page, sector_t sector,
1867 size_t size, unsigned long offset,
1868 struct block_device *bdev,
1869 struct bio **bio_ret,
1870 unsigned long max_pages,
1871 bio_end_io_t end_io_func,
1872 int mirror_num,
1873 unsigned long prev_bio_flags,
1874 unsigned long bio_flags)
1875{
1876 int ret = 0;
1877 struct bio *bio;
1878 int nr;
1879 int contig = 0;
1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1883
1884 if (bio_ret && *bio_ret) {
1885 bio = *bio_ret;
1886 if (old_compressed)
1887 contig = bio->bi_sector == sector;
1888 else
1889 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1890 sector;
1891
1892 if (prev_bio_flags != bio_flags || !contig ||
1893 (tree->ops && tree->ops->merge_bio_hook &&
1894 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1895 bio_flags)) ||
1896 bio_add_page(bio, page, page_size, offset) < page_size) {
1897 ret = submit_one_bio(rw, bio, mirror_num,
1898 prev_bio_flags);
1899 bio = NULL;
1900 } else {
1901 return 0;
1902 }
1903 }
1904 if (this_compressed)
1905 nr = BIO_MAX_PAGES;
1906 else
1907 nr = bio_get_nr_vecs(bdev);
1908
1909 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910
1911 bio_add_page(bio, page, page_size, offset);
1912 bio->bi_end_io = end_io_func;
1913 bio->bi_private = tree;
1914
1915 if (bio_ret)
1916 *bio_ret = bio;
1917 else
1918 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1919
1920 return ret;
1921}
1922
1923void set_page_extent_mapped(struct page *page)
1924{
1925 if (!PagePrivate(page)) {
1926 SetPagePrivate(page);
1927 page_cache_get(page);
1928 set_page_private(page, EXTENT_PAGE_PRIVATE);
1929 }
1930}
1931
1932static void set_page_extent_head(struct page *page, unsigned long len)
1933{
1934 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1935}
1936
1937/*
1938 * basic readpage implementation. Locked extent state structs are inserted
1939 * into the tree that are removed when the IO is done (by the end_io
1940 * handlers)
1941 */
1942static int __extent_read_full_page(struct extent_io_tree *tree,
1943 struct page *page,
1944 get_extent_t *get_extent,
1945 struct bio **bio, int mirror_num,
1946 unsigned long *bio_flags)
1947{
1948 struct inode *inode = page->mapping->host;
1949 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1950 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1951 u64 end;
1952 u64 cur = start;
1953 u64 extent_offset;
1954 u64 last_byte = i_size_read(inode);
1955 u64 block_start;
1956 u64 cur_end;
1957 sector_t sector;
1958 struct extent_map *em;
1959 struct block_device *bdev;
1960 int ret;
1961 int nr = 0;
1962 size_t page_offset = 0;
1963 size_t iosize;
1964 size_t disk_io_size;
1965 size_t blocksize = inode->i_sb->s_blocksize;
1966 unsigned long this_bio_flag = 0;
1967
1968 set_page_extent_mapped(page);
1969
1970 end = page_end;
1971 lock_extent(tree, start, end, GFP_NOFS);
1972
1973 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1974 char *userpage;
1975 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1976
1977 if (zero_offset) {
1978 iosize = PAGE_CACHE_SIZE - zero_offset;
1979 userpage = kmap_atomic(page, KM_USER0);
1980 memset(userpage + zero_offset, 0, iosize);
1981 flush_dcache_page(page);
1982 kunmap_atomic(userpage, KM_USER0);
1983 }
1984 }
1985 while (cur <= end) {
1986 if (cur >= last_byte) {
1987 char *userpage;
1988 iosize = PAGE_CACHE_SIZE - page_offset;
1989 userpage = kmap_atomic(page, KM_USER0);
1990 memset(userpage + page_offset, 0, iosize);
1991 flush_dcache_page(page);
1992 kunmap_atomic(userpage, KM_USER0);
1993 set_extent_uptodate(tree, cur, cur + iosize - 1,
1994 GFP_NOFS);
1995 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1996 break;
1997 }
1998 em = get_extent(inode, page, page_offset, cur,
1999 end - cur + 1, 0);
2000 if (IS_ERR(em) || !em) {
2001 SetPageError(page);
2002 unlock_extent(tree, cur, end, GFP_NOFS);
2003 break;
2004 }
2005 extent_offset = cur - em->start;
2006 BUG_ON(extent_map_end(em) <= cur);
2007 BUG_ON(end < cur);
2008
2009 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2010 this_bio_flag = EXTENT_BIO_COMPRESSED;
2011
2012 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2013 cur_end = min(extent_map_end(em) - 1, end);
2014 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2015 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2016 disk_io_size = em->block_len;
2017 sector = em->block_start >> 9;
2018 } else {
2019 sector = (em->block_start + extent_offset) >> 9;
2020 disk_io_size = iosize;
2021 }
2022 bdev = em->bdev;
2023 block_start = em->block_start;
2024 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2025 block_start = EXTENT_MAP_HOLE;
2026 free_extent_map(em);
2027 em = NULL;
2028
2029 /* we've found a hole, just zero and go on */
2030 if (block_start == EXTENT_MAP_HOLE) {
2031 char *userpage;
2032 userpage = kmap_atomic(page, KM_USER0);
2033 memset(userpage + page_offset, 0, iosize);
2034 flush_dcache_page(page);
2035 kunmap_atomic(userpage, KM_USER0);
2036
2037 set_extent_uptodate(tree, cur, cur + iosize - 1,
2038 GFP_NOFS);
2039 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2040 cur = cur + iosize;
2041 page_offset += iosize;
2042 continue;
2043 }
2044 /* the get_extent function already copied into the page */
2045 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2046 check_page_uptodate(tree, page);
2047 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2048 cur = cur + iosize;
2049 page_offset += iosize;
2050 continue;
2051 }
2052 /* we have an inline extent but it didn't get marked up
2053 * to date. Error out
2054 */
2055 if (block_start == EXTENT_MAP_INLINE) {
2056 SetPageError(page);
2057 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2058 cur = cur + iosize;
2059 page_offset += iosize;
2060 continue;
2061 }
2062
2063 ret = 0;
2064 if (tree->ops && tree->ops->readpage_io_hook) {
2065 ret = tree->ops->readpage_io_hook(page, cur,
2066 cur + iosize - 1);
2067 }
2068 if (!ret) {
2069 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2070 pnr -= page->index;
2071 ret = submit_extent_page(READ, tree, page,
2072 sector, disk_io_size, page_offset,
2073 bdev, bio, pnr,
2074 end_bio_extent_readpage, mirror_num,
2075 *bio_flags,
2076 this_bio_flag);
2077 nr++;
2078 *bio_flags = this_bio_flag;
2079 }
2080 if (ret)
2081 SetPageError(page);
2082 cur = cur + iosize;
2083 page_offset += iosize;
2084 }
2085 if (!nr) {
2086 if (!PageError(page))
2087 SetPageUptodate(page);
2088 unlock_page(page);
2089 }
2090 return 0;
2091}
2092
2093int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2094 get_extent_t *get_extent)
2095{
2096 struct bio *bio = NULL;
2097 unsigned long bio_flags = 0;
2098 int ret;
2099
2100 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2101 &bio_flags);
2102 if (bio)
2103 submit_one_bio(READ, bio, 0, bio_flags);
2104 return ret;
2105}
2106
2107/*
2108 * the writepage semantics are similar to regular writepage. extent
2109 * records are inserted to lock ranges in the tree, and as dirty areas
2110 * are found, they are marked writeback. Then the lock bits are removed
2111 * and the end_io handler clears the writeback ranges
2112 */
2113static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2114 void *data)
2115{
2116 struct inode *inode = page->mapping->host;
2117 struct extent_page_data *epd = data;
2118 struct extent_io_tree *tree = epd->tree;
2119 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2120 u64 delalloc_start;
2121 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2122 u64 end;
2123 u64 cur = start;
2124 u64 extent_offset;
2125 u64 last_byte = i_size_read(inode);
2126 u64 block_start;
2127 u64 iosize;
2128 u64 unlock_start;
2129 sector_t sector;
2130 struct extent_map *em;
2131 struct block_device *bdev;
2132 int ret;
2133 int nr = 0;
2134 size_t pg_offset = 0;
2135 size_t blocksize;
2136 loff_t i_size = i_size_read(inode);
2137 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2138 u64 nr_delalloc;
2139 u64 delalloc_end;
2140 int page_started;
2141 int compressed;
2142 unsigned long nr_written = 0;
2143
2144 WARN_ON(!PageLocked(page));
2145 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2146 if (page->index > end_index ||
2147 (page->index == end_index && !pg_offset)) {
2148 page->mapping->a_ops->invalidatepage(page, 0);
2149 unlock_page(page);
2150 return 0;
2151 }
2152
2153 if (page->index == end_index) {
2154 char *userpage;
2155
2156 userpage = kmap_atomic(page, KM_USER0);
2157 memset(userpage + pg_offset, 0,
2158 PAGE_CACHE_SIZE - pg_offset);
2159 kunmap_atomic(userpage, KM_USER0);
2160 flush_dcache_page(page);
2161 }
2162 pg_offset = 0;
2163
2164 set_page_extent_mapped(page);
2165
2166 delalloc_start = start;
2167 delalloc_end = 0;
2168 page_started = 0;
2169 if (!epd->extent_locked) {
2170 while (delalloc_end < page_end) {
2171 nr_delalloc = find_lock_delalloc_range(inode, tree,
2172 page,
2173 &delalloc_start,
2174 &delalloc_end,
2175 128 * 1024 * 1024);
2176 if (nr_delalloc == 0) {
2177 delalloc_start = delalloc_end + 1;
2178 continue;
2179 }
2180 tree->ops->fill_delalloc(inode, page, delalloc_start,
2181 delalloc_end, &page_started,
2182 &nr_written);
2183 delalloc_start = delalloc_end + 1;
2184 }
2185
2186 /* did the fill delalloc function already unlock and start
2187 * the IO?
2188 */
2189 if (page_started) {
2190 ret = 0;
2191 goto update_nr_written;
2192 }
2193 }
2194 lock_extent(tree, start, page_end, GFP_NOFS);
2195
2196 unlock_start = start;
2197
2198 if (tree->ops && tree->ops->writepage_start_hook) {
2199 ret = tree->ops->writepage_start_hook(page, start,
2200 page_end);
2201 if (ret == -EAGAIN) {
2202 unlock_extent(tree, start, page_end, GFP_NOFS);
2203 redirty_page_for_writepage(wbc, page);
2204 unlock_page(page);
2205 ret = 0;
2206 goto update_nr_written;
2207 }
2208 }
2209
2210 nr_written++;
2211
2212 end = page_end;
2213 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2214 printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2215
2216 if (last_byte <= start) {
2217 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2218 unlock_extent(tree, start, page_end, GFP_NOFS);
2219 if (tree->ops && tree->ops->writepage_end_io_hook)
2220 tree->ops->writepage_end_io_hook(page, start,
2221 page_end, NULL, 1);
2222 unlock_start = page_end + 1;
2223 goto done;
2224 }
2225
2226 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2227 blocksize = inode->i_sb->s_blocksize;
2228
2229 while (cur <= end) {
2230 if (cur >= last_byte) {
2231 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2232 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2233 if (tree->ops && tree->ops->writepage_end_io_hook)
2234 tree->ops->writepage_end_io_hook(page, cur,
2235 page_end, NULL, 1);
2236 unlock_start = page_end + 1;
2237 break;
2238 }
2239 em = epd->get_extent(inode, page, pg_offset, cur,
2240 end - cur + 1, 1);
2241 if (IS_ERR(em) || !em) {
2242 SetPageError(page);
2243 break;
2244 }
2245
2246 extent_offset = cur - em->start;
2247 BUG_ON(extent_map_end(em) <= cur);
2248 BUG_ON(end < cur);
2249 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2250 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2251 sector = (em->block_start + extent_offset) >> 9;
2252 bdev = em->bdev;
2253 block_start = em->block_start;
2254 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2255 free_extent_map(em);
2256 em = NULL;
2257
2258 /*
2259 * compressed and inline extents are written through other
2260 * paths in the FS
2261 */
2262 if (compressed || block_start == EXTENT_MAP_HOLE ||
2263 block_start == EXTENT_MAP_INLINE) {
2264 clear_extent_dirty(tree, cur,
2265 cur + iosize - 1, GFP_NOFS);
2266
2267 unlock_extent(tree, unlock_start, cur + iosize - 1,
2268 GFP_NOFS);
2269
2270 /*
2271 * end_io notification does not happen here for
2272 * compressed extents
2273 */
2274 if (!compressed && tree->ops &&
2275 tree->ops->writepage_end_io_hook)
2276 tree->ops->writepage_end_io_hook(page, cur,
2277 cur + iosize - 1,
2278 NULL, 1);
2279 else if (compressed) {
2280 /* we don't want to end_page_writeback on
2281 * a compressed extent. this happens
2282 * elsewhere
2283 */
2284 nr++;
2285 }
2286
2287 cur += iosize;
2288 pg_offset += iosize;
2289 unlock_start = cur;
2290 continue;
2291 }
2292 /* leave this out until we have a page_mkwrite call */
2293 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2294 EXTENT_DIRTY, 0)) {
2295 cur = cur + iosize;
2296 pg_offset += iosize;
2297 continue;
2298 }
2299
2300 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2301 if (tree->ops && tree->ops->writepage_io_hook) {
2302 ret = tree->ops->writepage_io_hook(page, cur,
2303 cur + iosize - 1);
2304 } else {
2305 ret = 0;
2306 }
2307 if (ret) {
2308 SetPageError(page);
2309 } else {
2310 unsigned long max_nr = end_index + 1;
2311
2312 set_range_writeback(tree, cur, cur + iosize - 1);
2313 if (!PageWriteback(page)) {
2314 printk(KERN_ERR "btrfs warning page %lu not "
2315 "writeback, cur %llu end %llu\n",
2316 page->index, (unsigned long long)cur,
2317 (unsigned long long)end);
2318 }
2319
2320 ret = submit_extent_page(WRITE, tree, page, sector,
2321 iosize, pg_offset, bdev,
2322 &epd->bio, max_nr,
2323 end_bio_extent_writepage,
2324 0, 0, 0);
2325 if (ret)
2326 SetPageError(page);
2327 }
2328 cur = cur + iosize;
2329 pg_offset += iosize;
2330 nr++;
2331 }
2332done:
2333 if (nr == 0) {
2334 /* make sure the mapping tag for page dirty gets cleared */
2335 set_page_writeback(page);
2336 end_page_writeback(page);
2337 }
2338 if (unlock_start <= page_end)
2339 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2340 unlock_page(page);
2341
2342update_nr_written:
2343 wbc->nr_to_write -= nr_written;
2344 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2345 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2346 page->mapping->writeback_index = page->index + nr_written;
2347 return 0;
2348}
2349
2350/**
2351 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2352 * @mapping: address space structure to write
2353 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2354 * @writepage: function called for each page
2355 * @data: data passed to writepage function
2356 *
2357 * If a page is already under I/O, write_cache_pages() skips it, even
2358 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2359 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2360 * and msync() need to guarantee that all the data which was dirty at the time
2361 * the call was made get new I/O started against them. If wbc->sync_mode is
2362 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2363 * existing IO to complete.
2364 */
2365static int extent_write_cache_pages(struct extent_io_tree *tree,
2366 struct address_space *mapping,
2367 struct writeback_control *wbc,
2368 writepage_t writepage, void *data,
2369 void (*flush_fn)(void *))
2370{
2371 struct backing_dev_info *bdi = mapping->backing_dev_info;
2372 int ret = 0;
2373 int done = 0;
2374 struct pagevec pvec;
2375 int nr_pages;
2376 pgoff_t index;
2377 pgoff_t end; /* Inclusive */
2378 int scanned = 0;
2379 int range_whole = 0;
2380
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */
2389 end = -1;
2390 } else {
2391 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2392 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2393 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2394 range_whole = 1;
2395 scanned = 1;
2396 }
2397retry:
2398 while (!done && (index <= end) &&
2399 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2400 PAGECACHE_TAG_DIRTY, min(end - index,
2401 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2402 unsigned i;
2403
2404 scanned = 1;
2405 for (i = 0; i < nr_pages; i++) {
2406 struct page *page = pvec.pages[i];
2407
2408 /*
2409 * At this point we hold neither mapping->tree_lock nor
2410 * lock on the page itself: the page may be truncated or
2411 * invalidated (changing page->mapping to NULL), or even
2412 * swizzled back from swapper_space to tmpfs file
2413 * mapping
2414 */
2415 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2416 tree->ops->write_cache_pages_lock_hook(page);
2417 else
2418 lock_page(page);
2419
2420 if (unlikely(page->mapping != mapping)) {
2421 unlock_page(page);
2422 continue;
2423 }
2424
2425 if (!wbc->range_cyclic && page->index > end) {
2426 done = 1;
2427 unlock_page(page);
2428 continue;
2429 }
2430
2431 if (wbc->sync_mode != WB_SYNC_NONE) {
2432 if (PageWriteback(page))
2433 flush_fn(data);
2434 wait_on_page_writeback(page);
2435 }
2436
2437 if (PageWriteback(page) ||
2438 !clear_page_dirty_for_io(page)) {
2439 unlock_page(page);
2440 continue;
2441 }
2442
2443 ret = (*writepage)(page, wbc, data);
2444
2445 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2446 unlock_page(page);
2447 ret = 0;
2448 }
2449 if (ret || wbc->nr_to_write <= 0)
2450 done = 1;
2451 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2452 wbc->encountered_congestion = 1;
2453 done = 1;
2454 }
2455 }
2456 pagevec_release(&pvec);
2457 cond_resched();
2458 }
2459 if (!scanned && !done) {
2460 /*
2461 * We hit the last page and there is more work to be done: wrap
2462 * back to the start of the file
2463 */
2464 scanned = 1;
2465 index = 0;
2466 goto retry;
2467 }
2468 return ret;
2469}
2470
2471static noinline void flush_write_bio(void *data)
2472{
2473 struct extent_page_data *epd = data;
2474 if (epd->bio) {
2475 submit_one_bio(WRITE, epd->bio, 0, 0);
2476 epd->bio = NULL;
2477 }
2478}
2479
2480int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2481 get_extent_t *get_extent,
2482 struct writeback_control *wbc)
2483{
2484 int ret;
2485 struct address_space *mapping = page->mapping;
2486 struct extent_page_data epd = {
2487 .bio = NULL,
2488 .tree = tree,
2489 .get_extent = get_extent,
2490 .extent_locked = 0,
2491 };
2492 struct writeback_control wbc_writepages = {
2493 .bdi = wbc->bdi,
2494 .sync_mode = WB_SYNC_NONE,
2495 .older_than_this = NULL,
2496 .nr_to_write = 64,
2497 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2498 .range_end = (loff_t)-1,
2499 };
2500
2501
2502 ret = __extent_writepage(page, wbc, &epd);
2503
2504 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2505 __extent_writepage, &epd, flush_write_bio);
2506 if (epd.bio)
2507 submit_one_bio(WRITE, epd.bio, 0, 0);
2508 return ret;
2509}
2510
2511int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2512 u64 start, u64 end, get_extent_t *get_extent,
2513 int mode)
2514{
2515 int ret = 0;
2516 struct address_space *mapping = inode->i_mapping;
2517 struct page *page;
2518 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2519 PAGE_CACHE_SHIFT;
2520
2521 struct extent_page_data epd = {
2522 .bio = NULL,
2523 .tree = tree,
2524 .get_extent = get_extent,
2525 .extent_locked = 1,
2526 };
2527 struct writeback_control wbc_writepages = {
2528 .bdi = inode->i_mapping->backing_dev_info,
2529 .sync_mode = mode,
2530 .older_than_this = NULL,
2531 .nr_to_write = nr_pages * 2,
2532 .range_start = start,
2533 .range_end = end + 1,
2534 };
2535
2536 while (start <= end) {
2537 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2538 if (clear_page_dirty_for_io(page))
2539 ret = __extent_writepage(page, &wbc_writepages, &epd);
2540 else {
2541 if (tree->ops && tree->ops->writepage_end_io_hook)
2542 tree->ops->writepage_end_io_hook(page, start,
2543 start + PAGE_CACHE_SIZE - 1,
2544 NULL, 1);
2545 unlock_page(page);
2546 }
2547 page_cache_release(page);
2548 start += PAGE_CACHE_SIZE;
2549 }
2550
2551 if (epd.bio)
2552 submit_one_bio(WRITE, epd.bio, 0, 0);
2553 return ret;
2554}
2555
2556int extent_writepages(struct extent_io_tree *tree,
2557 struct address_space *mapping,
2558 get_extent_t *get_extent,
2559 struct writeback_control *wbc)
2560{
2561 int ret = 0;
2562 struct extent_page_data epd = {
2563 .bio = NULL,
2564 .tree = tree,
2565 .get_extent = get_extent,
2566 .extent_locked = 0,
2567 };
2568
2569 ret = extent_write_cache_pages(tree, mapping, wbc,
2570 __extent_writepage, &epd,
2571 flush_write_bio);
2572 if (epd.bio)
2573 submit_one_bio(WRITE, epd.bio, 0, 0);
2574 return ret;
2575}
2576
2577int extent_readpages(struct extent_io_tree *tree,
2578 struct address_space *mapping,
2579 struct list_head *pages, unsigned nr_pages,
2580 get_extent_t get_extent)
2581{
2582 struct bio *bio = NULL;
2583 unsigned page_idx;
2584 struct pagevec pvec;
2585 unsigned long bio_flags = 0;
2586
2587 pagevec_init(&pvec, 0);
2588 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2589 struct page *page = list_entry(pages->prev, struct page, lru);
2590
2591 prefetchw(&page->flags);
2592 list_del(&page->lru);
2593 /*
2594 * what we want to do here is call add_to_page_cache_lru,
2595 * but that isn't exported, so we reproduce it here
2596 */
2597 if (!add_to_page_cache(page, mapping,
2598 page->index, GFP_KERNEL)) {
2599
2600 /* open coding of lru_cache_add, also not exported */
2601 page_cache_get(page);
2602 if (!pagevec_add(&pvec, page))
2603 __pagevec_lru_add_file(&pvec);
2604 __extent_read_full_page(tree, page, get_extent,
2605 &bio, 0, &bio_flags);
2606 }
2607 page_cache_release(page);
2608 }
2609 if (pagevec_count(&pvec))
2610 __pagevec_lru_add_file(&pvec);
2611 BUG_ON(!list_empty(pages));
2612 if (bio)
2613 submit_one_bio(READ, bio, 0, bio_flags);
2614 return 0;
2615}
2616
2617/*
2618 * basic invalidatepage code, this waits on any locked or writeback
2619 * ranges corresponding to the page, and then deletes any extent state
2620 * records from the tree
2621 */
2622int extent_invalidatepage(struct extent_io_tree *tree,
2623 struct page *page, unsigned long offset)
2624{
2625 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2626 u64 end = start + PAGE_CACHE_SIZE - 1;
2627 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2628
2629 start += (offset + blocksize - 1) & ~(blocksize - 1);
2630 if (start > end)
2631 return 0;
2632
2633 lock_extent(tree, start, end, GFP_NOFS);
2634 wait_on_extent_writeback(tree, start, end);
2635 clear_extent_bit(tree, start, end,
2636 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2637 1, 1, GFP_NOFS);
2638 return 0;
2639}
2640
2641/*
2642 * simple commit_write call, set_range_dirty is used to mark both
2643 * the pages and the extent records as dirty
2644 */
2645int extent_commit_write(struct extent_io_tree *tree,
2646 struct inode *inode, struct page *page,
2647 unsigned from, unsigned to)
2648{
2649 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2650
2651 set_page_extent_mapped(page);
2652 set_page_dirty(page);
2653
2654 if (pos > inode->i_size) {
2655 i_size_write(inode, pos);
2656 mark_inode_dirty(inode);
2657 }
2658 return 0;
2659}
2660
2661int extent_prepare_write(struct extent_io_tree *tree,
2662 struct inode *inode, struct page *page,
2663 unsigned from, unsigned to, get_extent_t *get_extent)
2664{
2665 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2666 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2667 u64 block_start;
2668 u64 orig_block_start;
2669 u64 block_end;
2670 u64 cur_end;
2671 struct extent_map *em;
2672 unsigned blocksize = 1 << inode->i_blkbits;
2673 size_t page_offset = 0;
2674 size_t block_off_start;
2675 size_t block_off_end;
2676 int err = 0;
2677 int iocount = 0;
2678 int ret = 0;
2679 int isnew;
2680
2681 set_page_extent_mapped(page);
2682
2683 block_start = (page_start + from) & ~((u64)blocksize - 1);
2684 block_end = (page_start + to - 1) | (blocksize - 1);
2685 orig_block_start = block_start;
2686
2687 lock_extent(tree, page_start, page_end, GFP_NOFS);
2688 while (block_start <= block_end) {
2689 em = get_extent(inode, page, page_offset, block_start,
2690 block_end - block_start + 1, 1);
2691 if (IS_ERR(em) || !em)
2692 goto err;
2693
2694 cur_end = min(block_end, extent_map_end(em) - 1);
2695 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2696 block_off_end = block_off_start + blocksize;
2697 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2698
2699 if (!PageUptodate(page) && isnew &&
2700 (block_off_end > to || block_off_start < from)) {
2701 void *kaddr;
2702
2703 kaddr = kmap_atomic(page, KM_USER0);
2704 if (block_off_end > to)
2705 memset(kaddr + to, 0, block_off_end - to);
2706 if (block_off_start < from)
2707 memset(kaddr + block_off_start, 0,
2708 from - block_off_start);
2709 flush_dcache_page(page);
2710 kunmap_atomic(kaddr, KM_USER0);
2711 }
2712 if ((em->block_start != EXTENT_MAP_HOLE &&
2713 em->block_start != EXTENT_MAP_INLINE) &&
2714 !isnew && !PageUptodate(page) &&
2715 (block_off_end > to || block_off_start < from) &&
2716 !test_range_bit(tree, block_start, cur_end,
2717 EXTENT_UPTODATE, 1)) {
2718 u64 sector;
2719 u64 extent_offset = block_start - em->start;
2720 size_t iosize;
2721 sector = (em->block_start + extent_offset) >> 9;
2722 iosize = (cur_end - block_start + blocksize) &
2723 ~((u64)blocksize - 1);
2724 /*
2725 * we've already got the extent locked, but we
2726 * need to split the state such that our end_bio
2727 * handler can clear the lock.
2728 */
2729 set_extent_bit(tree, block_start,
2730 block_start + iosize - 1,
2731 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2732 ret = submit_extent_page(READ, tree, page,
2733 sector, iosize, page_offset, em->bdev,
2734 NULL, 1,
2735 end_bio_extent_preparewrite, 0,
2736 0, 0);
2737 iocount++;
2738 block_start = block_start + iosize;
2739 } else {
2740 set_extent_uptodate(tree, block_start, cur_end,
2741 GFP_NOFS);
2742 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2743 block_start = cur_end + 1;
2744 }
2745 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2746 free_extent_map(em);
2747 }
2748 if (iocount) {
2749 wait_extent_bit(tree, orig_block_start,
2750 block_end, EXTENT_LOCKED);
2751 }
2752 check_page_uptodate(tree, page);
2753err:
2754 /* FIXME, zero out newly allocated blocks on error */
2755 return err;
2756}
2757
2758/*
2759 * a helper for releasepage, this tests for areas of the page that
2760 * are locked or under IO and drops the related state bits if it is safe
2761 * to drop the page.
2762 */
2763int try_release_extent_state(struct extent_map_tree *map,
2764 struct extent_io_tree *tree, struct page *page,
2765 gfp_t mask)
2766{
2767 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2768 u64 end = start + PAGE_CACHE_SIZE - 1;
2769 int ret = 1;
2770
2771 if (test_range_bit(tree, start, end,
2772 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2773 ret = 0;
2774 else {
2775 if ((mask & GFP_NOFS) == GFP_NOFS)
2776 mask = GFP_NOFS;
2777 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2778 1, 1, mask);
2779 }
2780 return ret;
2781}
2782
2783/*
2784 * a helper for releasepage. As long as there are no locked extents
2785 * in the range corresponding to the page, both state records and extent
2786 * map records are removed
2787 */
2788int try_release_extent_mapping(struct extent_map_tree *map,
2789 struct extent_io_tree *tree, struct page *page,
2790 gfp_t mask)
2791{
2792 struct extent_map *em;
2793 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2794 u64 end = start + PAGE_CACHE_SIZE - 1;
2795
2796 if ((mask & __GFP_WAIT) &&
2797 page->mapping->host->i_size > 16 * 1024 * 1024) {
2798 u64 len;
2799 while (start <= end) {
2800 len = end - start + 1;
2801 spin_lock(&map->lock);
2802 em = lookup_extent_mapping(map, start, len);
2803 if (!em || IS_ERR(em)) {
2804 spin_unlock(&map->lock);
2805 break;
2806 }
2807 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2808 em->start != start) {
2809 spin_unlock(&map->lock);
2810 free_extent_map(em);
2811 break;
2812 }
2813 if (!test_range_bit(tree, em->start,
2814 extent_map_end(em) - 1,
2815 EXTENT_LOCKED | EXTENT_WRITEBACK |
2816 EXTENT_ORDERED,
2817 0)) {
2818 remove_extent_mapping(map, em);
2819 /* once for the rb tree */
2820 free_extent_map(em);
2821 }
2822 start = extent_map_end(em);
2823 spin_unlock(&map->lock);
2824
2825 /* once for us */
2826 free_extent_map(em);
2827 }
2828 }
2829 return try_release_extent_state(map, tree, page, mask);
2830}
2831
2832sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2833 get_extent_t *get_extent)
2834{
2835 struct inode *inode = mapping->host;
2836 u64 start = iblock << inode->i_blkbits;
2837 sector_t sector = 0;
2838 size_t blksize = (1 << inode->i_blkbits);
2839 struct extent_map *em;
2840
2841 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2842 GFP_NOFS);
2843 em = get_extent(inode, NULL, 0, start, blksize, 0);
2844 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2845 GFP_NOFS);
2846 if (!em || IS_ERR(em))
2847 return 0;
2848
2849 if (em->block_start > EXTENT_MAP_LAST_BYTE)
2850 goto out;
2851
2852 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2853out:
2854 free_extent_map(em);
2855 return sector;
2856}
2857
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i)
2860{
2861 struct page *p;
2862 struct address_space *mapping;
2863
2864 if (i == 0)
2865 return eb->first_page;
2866 i += eb->start >> PAGE_CACHE_SHIFT;
2867 mapping = eb->first_page->mapping;
2868 if (!mapping)
2869 return NULL;
2870
2871 /*
2872 * extent_buffer_page is only called after pinning the page
2873 * by increasing the reference count. So we know the page must
2874 * be in the radix tree.
2875 */
2876 rcu_read_lock();
2877 p = radix_tree_lookup(&mapping->page_tree, i);
2878 rcu_read_unlock();
2879
2880 return p;
2881}
2882
2883static inline unsigned long num_extent_pages(u64 start, u64 len)
2884{
2885 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2886 (start >> PAGE_CACHE_SHIFT);
2887}
2888
2889static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2890 u64 start,
2891 unsigned long len,
2892 gfp_t mask)
2893{
2894 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG
2896 unsigned long flags;
2897#endif
2898
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2900 eb->start = start;
2901 eb->len = len;
2902 mutex_init(&eb->mutex);
2903#ifdef LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags);
2907#endif
2908 atomic_set(&eb->refs, 1);
2909
2910 return eb;
2911}
2912
2913static void __free_extent_buffer(struct extent_buffer *eb)
2914{
2915#ifdef LEAK_DEBUG
2916 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list);
2919 spin_unlock_irqrestore(&leak_lock, flags);
2920#endif
2921 kmem_cache_free(extent_buffer_cache, eb);
2922}
2923
2924struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2925 u64 start, unsigned long len,
2926 struct page *page0,
2927 gfp_t mask)
2928{
2929 unsigned long num_pages = num_extent_pages(start, len);
2930 unsigned long i;
2931 unsigned long index = start >> PAGE_CACHE_SHIFT;
2932 struct extent_buffer *eb;
2933 struct extent_buffer *exists = NULL;
2934 struct page *p;
2935 struct address_space *mapping = tree->mapping;
2936 int uptodate = 1;
2937
2938 spin_lock(&tree->buffer_lock);
2939 eb = buffer_search(tree, start);
2940 if (eb) {
2941 atomic_inc(&eb->refs);
2942 spin_unlock(&tree->buffer_lock);
2943 mark_page_accessed(eb->first_page);
2944 return eb;
2945 }
2946 spin_unlock(&tree->buffer_lock);
2947
2948 eb = __alloc_extent_buffer(tree, start, len, mask);
2949 if (!eb)
2950 return NULL;
2951
2952 if (page0) {
2953 eb->first_page = page0;
2954 i = 1;
2955 index++;
2956 page_cache_get(page0);
2957 mark_page_accessed(page0);
2958 set_page_extent_mapped(page0);
2959 set_page_extent_head(page0, len);
2960 uptodate = PageUptodate(page0);
2961 } else {
2962 i = 0;
2963 }
2964 for (; i < num_pages; i++, index++) {
2965 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2966 if (!p) {
2967 WARN_ON(1);
2968 goto free_eb;
2969 }
2970 set_page_extent_mapped(p);
2971 mark_page_accessed(p);
2972 if (i == 0) {
2973 eb->first_page = p;
2974 set_page_extent_head(p, len);
2975 } else {
2976 set_page_private(p, EXTENT_PAGE_PRIVATE);
2977 }
2978 if (!PageUptodate(p))
2979 uptodate = 0;
2980 unlock_page(p);
2981 }
2982 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE;
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985
2986 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node);
2988 if (exists) {
2989 /* add one reference for the caller */
2990 atomic_inc(&exists->refs);
2991 spin_unlock(&tree->buffer_lock);
2992 goto free_eb;
2993 }
2994 spin_unlock(&tree->buffer_lock);
2995
2996 /* add one reference for the tree */
2997 atomic_inc(&eb->refs);
2998 return eb;
2999
3000free_eb:
3001 if (!atomic_dec_and_test(&eb->refs))
3002 return exists;
3003 for (index = 1; index < i; index++)
3004 page_cache_release(extent_buffer_page(eb, index));
3005 page_cache_release(extent_buffer_page(eb, 0));
3006 __free_extent_buffer(eb);
3007 return exists;
3008}
3009
3010struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3011 u64 start, unsigned long len,
3012 gfp_t mask)
3013{
3014 struct extent_buffer *eb;
3015
3016 spin_lock(&tree->buffer_lock);
3017 eb = buffer_search(tree, start);
3018 if (eb)
3019 atomic_inc(&eb->refs);
3020 spin_unlock(&tree->buffer_lock);
3021
3022 if (eb)
3023 mark_page_accessed(eb->first_page);
3024
3025 return eb;
3026}
3027
3028void free_extent_buffer(struct extent_buffer *eb)
3029{
3030 if (!eb)
3031 return;
3032
3033 if (!atomic_dec_and_test(&eb->refs))
3034 return;
3035
3036 WARN_ON(1);
3037}
3038
3039int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3040 struct extent_buffer *eb)
3041{
3042 int set;
3043 unsigned long i;
3044 unsigned long num_pages;
3045 struct page *page;
3046
3047 u64 start = eb->start;
3048 u64 end = start + eb->len - 1;
3049
3050 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3051 num_pages = num_extent_pages(eb->start, eb->len);
3052
3053 for (i = 0; i < num_pages; i++) {
3054 page = extent_buffer_page(eb, i);
3055 if (!set && !PageDirty(page))
3056 continue;
3057
3058 lock_page(page);
3059 if (i == 0)
3060 set_page_extent_head(page, eb->len);
3061 else
3062 set_page_private(page, EXTENT_PAGE_PRIVATE);
3063
3064 /*
3065 * if we're on the last page or the first page and the
3066 * block isn't aligned on a page boundary, do extra checks
3067 * to make sure we don't clean page that is partially dirty
3068 */
3069 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3070 ((i == num_pages - 1) &&
3071 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3072 start = (u64)page->index << PAGE_CACHE_SHIFT;
3073 end = start + PAGE_CACHE_SIZE - 1;
3074 if (test_range_bit(tree, start, end,
3075 EXTENT_DIRTY, 0)) {
3076 unlock_page(page);
3077 continue;
3078 }
3079 }
3080 clear_page_dirty_for_io(page);
3081 spin_lock_irq(&page->mapping->tree_lock);
3082 if (!PageDirty(page)) {
3083 radix_tree_tag_clear(&page->mapping->page_tree,
3084 page_index(page),
3085 PAGECACHE_TAG_DIRTY);
3086 }
3087 spin_unlock_irq(&page->mapping->tree_lock);
3088 unlock_page(page);
3089 }
3090 return 0;
3091}
3092
3093int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3094 struct extent_buffer *eb)
3095{
3096 return wait_on_extent_writeback(tree, eb->start,
3097 eb->start + eb->len - 1);
3098}
3099
3100int set_extent_buffer_dirty(struct extent_io_tree *tree,
3101 struct extent_buffer *eb)
3102{
3103 unsigned long i;
3104 unsigned long num_pages;
3105
3106 num_pages = num_extent_pages(eb->start, eb->len);
3107 for (i = 0; i < num_pages; i++) {
3108 struct page *page = extent_buffer_page(eb, i);
3109 /* writepage may need to do something special for the
3110 * first page, we have to make sure page->private is
3111 * properly set. releasepage may drop page->private
3112 * on us if the page isn't already dirty.
3113 */
3114 lock_page(page);
3115 if (i == 0) {
3116 set_page_extent_head(page, eb->len);
3117 } else if (PagePrivate(page) &&
3118 page->private != EXTENT_PAGE_PRIVATE) {
3119 set_page_extent_mapped(page);
3120 }
3121 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3122 set_extent_dirty(tree, page_offset(page),
3123 page_offset(page) + PAGE_CACHE_SIZE - 1,
3124 GFP_NOFS);
3125 unlock_page(page);
3126 }
3127 return 0;
3128}
3129
3130int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3131 struct extent_buffer *eb)
3132{
3133 unsigned long i;
3134 struct page *page;
3135 unsigned long num_pages;
3136
3137 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE;
3139
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS);
3142 for (i = 0; i < num_pages; i++) {
3143 page = extent_buffer_page(eb, i);
3144 if (page)
3145 ClearPageUptodate(page);
3146 }
3147 return 0;
3148}
3149
3150int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3151 struct extent_buffer *eb)
3152{
3153 unsigned long i;
3154 struct page *page;
3155 unsigned long num_pages;
3156
3157 num_pages = num_extent_pages(eb->start, eb->len);
3158
3159 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3160 GFP_NOFS);
3161 for (i = 0; i < num_pages; i++) {
3162 page = extent_buffer_page(eb, i);
3163 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3164 ((i == num_pages - 1) &&
3165 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3166 check_page_uptodate(tree, page);
3167 continue;
3168 }
3169 SetPageUptodate(page);
3170 }
3171 return 0;
3172}
3173
3174int extent_range_uptodate(struct extent_io_tree *tree,
3175 u64 start, u64 end)
3176{
3177 struct page *page;
3178 int ret;
3179 int pg_uptodate = 1;
3180 int uptodate;
3181 unsigned long index;
3182
3183 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3184 if (ret)
3185 return 1;
3186 while (start <= end) {
3187 index = start >> PAGE_CACHE_SHIFT;
3188 page = find_get_page(tree->mapping, index);
3189 uptodate = PageUptodate(page);
3190 page_cache_release(page);
3191 if (!uptodate) {
3192 pg_uptodate = 0;
3193 break;
3194 }
3195 start += PAGE_CACHE_SIZE;
3196 }
3197 return pg_uptodate;
3198}
3199
3200int extent_buffer_uptodate(struct extent_io_tree *tree,
3201 struct extent_buffer *eb)
3202{
3203 int ret = 0;
3204 unsigned long num_pages;
3205 unsigned long i;
3206 struct page *page;
3207 int pg_uptodate = 1;
3208
3209 if (eb->flags & EXTENT_UPTODATE)
3210 return 1;
3211
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3213 EXTENT_UPTODATE, 1);
3214 if (ret)
3215 return ret;
3216
3217 num_pages = num_extent_pages(eb->start, eb->len);
3218 for (i = 0; i < num_pages; i++) {
3219 page = extent_buffer_page(eb, i);
3220 if (!PageUptodate(page)) {
3221 pg_uptodate = 0;
3222 break;
3223 }
3224 }
3225 return pg_uptodate;
3226}
3227
3228int read_extent_buffer_pages(struct extent_io_tree *tree,
3229 struct extent_buffer *eb,
3230 u64 start, int wait,
3231 get_extent_t *get_extent, int mirror_num)
3232{
3233 unsigned long i;
3234 unsigned long start_i;
3235 struct page *page;
3236 int err;
3237 int ret = 0;
3238 int locked_pages = 0;
3239 int all_uptodate = 1;
3240 int inc_all_pages = 0;
3241 unsigned long num_pages;
3242 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0;
3244
3245 if (eb->flags & EXTENT_UPTODATE)
3246 return 0;
3247
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3249 EXTENT_UPTODATE, 1)) {
3250 return 0;
3251 }
3252
3253 if (start) {
3254 WARN_ON(start < eb->start);
3255 start_i = (start >> PAGE_CACHE_SHIFT) -
3256 (eb->start >> PAGE_CACHE_SHIFT);
3257 } else {
3258 start_i = 0;
3259 }
3260
3261 num_pages = num_extent_pages(eb->start, eb->len);
3262 for (i = start_i; i < num_pages; i++) {
3263 page = extent_buffer_page(eb, i);
3264 if (!wait) {
3265 if (!trylock_page(page))
3266 goto unlock_exit;
3267 } else {
3268 lock_page(page);
3269 }
3270 locked_pages++;
3271 if (!PageUptodate(page))
3272 all_uptodate = 0;
3273 }
3274 if (all_uptodate) {
3275 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE;
3277 goto unlock_exit;
3278 }
3279
3280 for (i = start_i; i < num_pages; i++) {
3281 page = extent_buffer_page(eb, i);
3282 if (inc_all_pages)
3283 page_cache_get(page);
3284 if (!PageUptodate(page)) {
3285 if (start_i == 0)
3286 inc_all_pages = 1;
3287 ClearPageError(page);
3288 err = __extent_read_full_page(tree, page,
3289 get_extent, &bio,
3290 mirror_num, &bio_flags);
3291 if (err)
3292 ret = err;
3293 } else {
3294 unlock_page(page);
3295 }
3296 }
3297
3298 if (bio)
3299 submit_one_bio(READ, bio, mirror_num, bio_flags);
3300
3301 if (ret || !wait)
3302 return ret;
3303
3304 for (i = start_i; i < num_pages; i++) {
3305 page = extent_buffer_page(eb, i);
3306 wait_on_page_locked(page);
3307 if (!PageUptodate(page))
3308 ret = -EIO;
3309 }
3310
3311 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE;
3313 return ret;
3314
3315unlock_exit:
3316 i = start_i;
3317 while (locked_pages > 0) {
3318 page = extent_buffer_page(eb, i);
3319 i++;
3320 unlock_page(page);
3321 locked_pages--;
3322 }
3323 return ret;
3324}
3325
3326void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3327 unsigned long start,
3328 unsigned long len)
3329{
3330 size_t cur;
3331 size_t offset;
3332 struct page *page;
3333 char *kaddr;
3334 char *dst = (char *)dstv;
3335 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3336 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3337
3338 WARN_ON(start > eb->len);
3339 WARN_ON(start + len > eb->start + eb->len);
3340
3341 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3342
3343 while (len > 0) {
3344 page = extent_buffer_page(eb, i);
3345
3346 cur = min(len, (PAGE_CACHE_SIZE - offset));
3347 kaddr = kmap_atomic(page, KM_USER1);
3348 memcpy(dst, kaddr + offset, cur);
3349 kunmap_atomic(kaddr, KM_USER1);
3350
3351 dst += cur;
3352 len -= cur;
3353 offset = 0;
3354 i++;
3355 }
3356}
3357
3358int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3359 unsigned long min_len, char **token, char **map,
3360 unsigned long *map_start,
3361 unsigned long *map_len, int km)
3362{
3363 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3364 char *kaddr;
3365 struct page *p;
3366 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3367 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3368 unsigned long end_i = (start_offset + start + min_len - 1) >>
3369 PAGE_CACHE_SHIFT;
3370
3371 if (i != end_i)
3372 return -EINVAL;
3373
3374 if (i == 0) {
3375 offset = start_offset;
3376 *map_start = 0;
3377 } else {
3378 offset = 0;
3379 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3380 }
3381
3382 if (start + min_len > eb->len) {
3383 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3384 "wanted %lu %lu\n", (unsigned long long)eb->start,
3385 eb->len, start, min_len);
3386 WARN_ON(1);
3387 }
3388
3389 p = extent_buffer_page(eb, i);
3390 kaddr = kmap_atomic(p, km);
3391 *token = kaddr;
3392 *map = kaddr + offset;
3393 *map_len = PAGE_CACHE_SIZE - offset;
3394 return 0;
3395}
3396
3397int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3398 unsigned long min_len,
3399 char **token, char **map,
3400 unsigned long *map_start,
3401 unsigned long *map_len, int km)
3402{
3403 int err;
3404 int save = 0;
3405 if (eb->map_token) {
3406 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL;
3408 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km);
3413 if (!err && save) {
3414 eb->map_token = *token;
3415 eb->kaddr = *map;
3416 eb->map_start = *map_start;
3417 eb->map_len = *map_len;
3418 }
3419 return err;
3420}
3421
/*
 * Undo a mapping made by map_private_extent_buffer(): @token is the
 * kmap address it handed out and @km the same kmap slot.  @eb itself is
 * not touched.
 */
void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
{
	kunmap_atomic(token, km);
}
3426
3427int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3428 unsigned long start,
3429 unsigned long len)
3430{
3431 size_t cur;
3432 size_t offset;
3433 struct page *page;
3434 char *kaddr;
3435 char *ptr = (char *)ptrv;
3436 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3437 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3438 int ret = 0;
3439
3440 WARN_ON(start > eb->len);
3441 WARN_ON(start + len > eb->start + eb->len);
3442
3443 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3444
3445 while (len > 0) {
3446 page = extent_buffer_page(eb, i);
3447
3448 cur = min(len, (PAGE_CACHE_SIZE - offset));
3449
3450 kaddr = kmap_atomic(page, KM_USER0);
3451 ret = memcmp(ptr, kaddr + offset, cur);
3452 kunmap_atomic(kaddr, KM_USER0);
3453 if (ret)
3454 break;
3455
3456 ptr += cur;
3457 len -= cur;
3458 offset = 0;
3459 i++;
3460 }
3461 return ret;
3462}
3463
3464void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3465 unsigned long start, unsigned long len)
3466{
3467 size_t cur;
3468 size_t offset;
3469 struct page *page;
3470 char *kaddr;
3471 char *src = (char *)srcv;
3472 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3473 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3474
3475 WARN_ON(start > eb->len);
3476 WARN_ON(start + len > eb->start + eb->len);
3477
3478 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3479
3480 while (len > 0) {
3481 page = extent_buffer_page(eb, i);
3482 WARN_ON(!PageUptodate(page));
3483
3484 cur = min(len, PAGE_CACHE_SIZE - offset);
3485 kaddr = kmap_atomic(page, KM_USER1);
3486 memcpy(kaddr + offset, src, cur);
3487 kunmap_atomic(kaddr, KM_USER1);
3488
3489 src += cur;
3490 len -= cur;
3491 offset = 0;
3492 i++;
3493 }
3494}
3495
3496void memset_extent_buffer(struct extent_buffer *eb, char c,
3497 unsigned long start, unsigned long len)
3498{
3499 size_t cur;
3500 size_t offset;
3501 struct page *page;
3502 char *kaddr;
3503 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3504 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3505
3506 WARN_ON(start > eb->len);
3507 WARN_ON(start + len > eb->start + eb->len);
3508
3509 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3510
3511 while (len > 0) {
3512 page = extent_buffer_page(eb, i);
3513 WARN_ON(!PageUptodate(page));
3514
3515 cur = min(len, PAGE_CACHE_SIZE - offset);
3516 kaddr = kmap_atomic(page, KM_USER0);
3517 memset(kaddr + offset, c, cur);
3518 kunmap_atomic(kaddr, KM_USER0);
3519
3520 len -= cur;
3521 offset = 0;
3522 i++;
3523 }
3524}
3525
3526void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3527 unsigned long dst_offset, unsigned long src_offset,
3528 unsigned long len)
3529{
3530 u64 dst_len = dst->len;
3531 size_t cur;
3532 size_t offset;
3533 struct page *page;
3534 char *kaddr;
3535 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3536 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3537
3538 WARN_ON(src->len != dst_len);
3539
3540 offset = (start_offset + dst_offset) &
3541 ((unsigned long)PAGE_CACHE_SIZE - 1);
3542
3543 while (len > 0) {
3544 page = extent_buffer_page(dst, i);
3545 WARN_ON(!PageUptodate(page));
3546
3547 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3548
3549 kaddr = kmap_atomic(page, KM_USER0);
3550 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3551 kunmap_atomic(kaddr, KM_USER0);
3552
3553 src_offset += cur;
3554 len -= cur;
3555 offset = 0;
3556 i++;
3557 }
3558}
3559
3560static void move_pages(struct page *dst_page, struct page *src_page,
3561 unsigned long dst_off, unsigned long src_off,
3562 unsigned long len)
3563{
3564 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3565 if (dst_page == src_page) {
3566 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3567 } else {
3568 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3569 char *p = dst_kaddr + dst_off + len;
3570 char *s = src_kaddr + src_off + len;
3571
3572 while (len--)
3573 *--p = *--s;
3574
3575 kunmap_atomic(src_kaddr, KM_USER1);
3576 }
3577 kunmap_atomic(dst_kaddr, KM_USER0);
3578}
3579
3580static void copy_pages(struct page *dst_page, struct page *src_page,
3581 unsigned long dst_off, unsigned long src_off,
3582 unsigned long len)
3583{
3584 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3585 char *src_kaddr;
3586
3587 if (dst_page != src_page)
3588 src_kaddr = kmap_atomic(src_page, KM_USER1);
3589 else
3590 src_kaddr = dst_kaddr;
3591
3592 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3593 kunmap_atomic(dst_kaddr, KM_USER0);
3594 if (dst_page != src_page)
3595 kunmap_atomic(src_kaddr, KM_USER1);
3596}
3597
3598void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3599 unsigned long src_offset, unsigned long len)
3600{
3601 size_t cur;
3602 size_t dst_off_in_page;
3603 size_t src_off_in_page;
3604 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3605 unsigned long dst_i;
3606 unsigned long src_i;
3607
3608 if (src_offset + len > dst->len) {
3609 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3610 "len %lu dst len %lu\n", src_offset, len, dst->len);
3611 BUG_ON(1);
3612 }
3613 if (dst_offset + len > dst->len) {
3614 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3615 "len %lu dst len %lu\n", dst_offset, len, dst->len);
3616 BUG_ON(1);
3617 }
3618
3619 while (len > 0) {
3620 dst_off_in_page = (start_offset + dst_offset) &
3621 ((unsigned long)PAGE_CACHE_SIZE - 1);
3622 src_off_in_page = (start_offset + src_offset) &
3623 ((unsigned long)PAGE_CACHE_SIZE - 1);
3624
3625 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3626 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3627
3628 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3629 src_off_in_page));
3630 cur = min_t(unsigned long, cur,
3631 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3632
3633 copy_pages(extent_buffer_page(dst, dst_i),
3634 extent_buffer_page(dst, src_i),
3635 dst_off_in_page, src_off_in_page, cur);
3636
3637 src_offset += cur;
3638 dst_offset += cur;
3639 len -= cur;
3640 }
3641}
3642
3643void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3644 unsigned long src_offset, unsigned long len)
3645{
3646 size_t cur;
3647 size_t dst_off_in_page;
3648 size_t src_off_in_page;
3649 unsigned long dst_end = dst_offset + len - 1;
3650 unsigned long src_end = src_offset + len - 1;
3651 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3652 unsigned long dst_i;
3653 unsigned long src_i;
3654
3655 if (src_offset + len > dst->len) {
3656 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3657 "len %lu len %lu\n", src_offset, len, dst->len);
3658 BUG_ON(1);
3659 }
3660 if (dst_offset + len > dst->len) {
3661 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3662 "len %lu len %lu\n", dst_offset, len, dst->len);
3663 BUG_ON(1);
3664 }
3665 if (dst_offset < src_offset) {
3666 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3667 return;
3668 }
3669 while (len > 0) {
3670 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3671 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3672
3673 dst_off_in_page = (start_offset + dst_end) &
3674 ((unsigned long)PAGE_CACHE_SIZE - 1);
3675 src_off_in_page = (start_offset + src_end) &
3676 ((unsigned long)PAGE_CACHE_SIZE - 1);
3677
3678 cur = min_t(unsigned long, len, src_off_in_page + 1);
3679 cur = min(cur, dst_off_in_page + 1);
3680 move_pages(extent_buffer_page(dst, dst_i),
3681 extent_buffer_page(dst, src_i),
3682 dst_off_in_page - cur + 1,
3683 src_off_in_page - cur + 1, cur);
3684
3685 dst_end -= cur;
3686 src_end -= cur;
3687 len -= cur;
3688 }
3689}
3690
3691int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3692{
3693 u64 start = page_offset(page);
3694 struct extent_buffer *eb;
3695 int ret = 1;
3696 unsigned long i;
3697 unsigned long num_pages;
3698
3699 spin_lock(&tree->buffer_lock);
3700 eb = buffer_search(tree, start);
3701 if (!eb)
3702 goto out;
3703
3704 if (atomic_read(&eb->refs) > 1) {
3705 ret = 0;
3706 goto out;
3707 }
3708 /* at this point we can safely release the extent buffer */
3709 num_pages = num_extent_pages(eb->start, eb->len);
3710 for (i = 0; i < num_pages; i++)
3711 page_cache_release(extent_buffer_page(eb, i));
3712 rb_erase(&eb->rb_node, &tree->buffer);
3713 __free_extent_buffer(eb);
3714out:
3715 spin_unlock(&tree->buffer_lock);
3716 return ret;
3717}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..c5b483a79137
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,269 @@
1#ifndef __EXTENTIO__
2#define __EXTENTIO__
3
4#include <linux/rbtree.h>
5
/*
 * bits for the extent state
 *
 * Each value is a distinct single bit so they can be OR-ed together.
 * EXTENT_UPTODATE and EXTENT_BUFFER_FILLED are also stored in the flags
 * field of struct extent_buffer.
 */
#define EXTENT_DIRTY 1
#define EXTENT_WRITEBACK (1 << 1)
#define EXTENT_UPTODATE (1 << 2)
#define EXTENT_LOCKED (1 << 3)
#define EXTENT_NEW (1 << 4)
#define EXTENT_DELALLOC (1 << 5)
#define EXTENT_DEFRAG (1 << 6)
#define EXTENT_DEFRAG_DONE (1 << 7)
#define EXTENT_BUFFER_FILLED (1 << 8)
#define EXTENT_ORDERED (1 << 9)
#define EXTENT_ORDERED_METADATA (1 << 10)
#define EXTENT_BOUNDARY (1 << 11)
#define EXTENT_NODATASUM (1 << 12)
/* shorthand for "locked or under writeback" */
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)

/* flags for bio submission */
#define EXTENT_BIO_COMPRESSED 1

/*
 * page->private values.  Every page that is controlled by the extent
 * map has page->private set to one.
 */
#define EXTENT_PAGE_PRIVATE 1
#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
31
32struct extent_state;
33
/* signature shared by bio submission hooks (see submit_bio_hook below) */
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
				       struct bio *bio, int mirror_num,
				       unsigned long bio_flags);
/*
 * Table of callbacks attached to an extent_io_tree through its ops
 * pointer.  The call sites are not part of this header.
 * NOTE(review): which hooks may be left NULL should be confirmed against
 * the callers in extent_io.c.
 */
struct extent_io_ops {
	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
			     u64 start, u64 end, int *page_started,
			     unsigned long *nr_written);
	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
	extent_submit_bio_hook_t *submit_bio_hook;
	int (*merge_bio_hook)(struct page *page, unsigned long offset,
			      size_t size, struct bio *bio,
			      unsigned long bio_flags);
	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
				       u64 start, u64 end,
				       struct extent_state *state);
	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
					u64 start, u64 end,
					struct extent_state *state);
	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
				    struct extent_state *state);
	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
				     struct extent_state *state, int uptodate);
	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
			    unsigned long old, unsigned long bits);
	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
			      unsigned long old, unsigned long bits);
	int (*write_cache_pages_lock_hook)(struct page *page);
};
64
/*
 * Per-mapping extent tracking: one rb tree of extent_state records and
 * one rb tree of extent_buffers.
 */
struct extent_io_tree {
	/* presumably the tree of struct extent_state, guarded by lock */
	struct rb_root state;
	/* extent_buffers keyed by start offset, guarded by buffer_lock */
	struct rb_root buffer;
	/* address_space whose page cache backs the extent buffers */
	struct address_space *mapping;
	u64 dirty_bytes;
	spinlock_t lock;
	/* taken around every search, insert and erase on "buffer" */
	spinlock_t buffer_lock;
	/* optional callbacks, see struct extent_io_ops */
	struct extent_io_ops *ops;
};
74
/*
 * State record for one byte range [start, end].  Records are linked
 * into an rb tree through rb_node (see extent_state_next()).
 */
struct extent_state {
	u64 start;
	u64 end; /* inclusive */
	struct rb_node rb_node;
	struct extent_io_tree *tree;
	wait_queue_head_t wq;
	atomic_t refs;
	/* presumably a mask of the EXTENT_* state bits - TODO confirm */
	unsigned long state;

	/* for use by the FS */
	u64 private;

	struct list_head leak_list;
};
89
/*
 * A reference-counted buffer covering the byte range
 * [start, start + len), backed by page cache pages.
 */
struct extent_buffer {
	/* byte offset of the buffer; also its key in the buffer rb tree */
	u64 start;
	/* length of the buffer in bytes */
	unsigned long len;
	/* cached atomic mapping, maintained by map_extent_buffer() */
	char *map_token;
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	/* page 0 of the buffer; later pages are looked up via its mapping */
	struct page *first_page;
	/* starts at 1; one more is taken for the tree and per lookup */
	atomic_t refs;
	/* holds EXTENT_UPTODATE and EXTENT_BUFFER_FILLED */
	int flags;
	/* link in the LEAK_DEBUG list of live buffers */
	struct list_head leak_list;
	/* link in extent_io_tree->buffer */
	struct rb_node rb_node;
	struct mutex mutex;
};
104
105struct extent_map_tree;
106
107static inline struct extent_state *extent_state_next(struct extent_state *state)
108{
109 struct rb_node *node;
110 node = rb_next(&state->rb_node);
111 if (!node)
112 return NULL;
113 return rb_entry(node, struct extent_state, rb_node);
114}
115
/*
 * Callback type used to look up the extent_map that covers
 * [start, start + len) of an inode.  Callers in extent_io.c check the
 * result for both NULL and IS_ERR() and drop it with free_extent_map().
 */
typedef struct extent_map *(get_extent_t)(struct inode *inode,
					  struct page *page,
					  size_t page_offset,
					  u64 start, u64 len,
					  int create);
121
122void extent_io_tree_init(struct extent_io_tree *tree,
123 struct address_space *mapping, gfp_t mask);
124int try_release_extent_mapping(struct extent_map_tree *map,
125 struct extent_io_tree *tree, struct page *page,
126 gfp_t mask);
127int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
128int try_release_extent_state(struct extent_map_tree *map,
129 struct extent_io_tree *tree, struct page *page,
130 gfp_t mask);
131int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
132int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
133int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
134 gfp_t mask);
135int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
136 get_extent_t *get_extent);
137int __init extent_io_init(void);
138void extent_io_exit(void);
139
140u64 count_range_bits(struct extent_io_tree *tree,
141 u64 *start, u64 search_end,
142 u64 max_bytes, unsigned long bits);
143
144int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
145 int bits, int filled);
146int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
147 int bits, gfp_t mask);
148int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
149 int bits, int wake, int delete, gfp_t mask);
150int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
151 int bits, gfp_t mask);
152int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
153 gfp_t mask);
154int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
155 gfp_t mask);
156int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
157 gfp_t mask);
158int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
159 gfp_t mask);
160int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
161 gfp_t mask);
162int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
163 u64 end, gfp_t mask);
164int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
165 gfp_t mask);
166int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
167 gfp_t mask);
168int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
169 u64 *start_ret, u64 *end_ret, int bits);
170struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
171 u64 start, int bits);
172int extent_invalidatepage(struct extent_io_tree *tree,
173 struct page *page, unsigned long offset);
174int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
175 get_extent_t *get_extent,
176 struct writeback_control *wbc);
177int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
178 u64 start, u64 end, get_extent_t *get_extent,
179 int mode);
180int extent_writepages(struct extent_io_tree *tree,
181 struct address_space *mapping,
182 get_extent_t *get_extent,
183 struct writeback_control *wbc);
184int extent_readpages(struct extent_io_tree *tree,
185 struct address_space *mapping,
186 struct list_head *pages, unsigned nr_pages,
187 get_extent_t get_extent);
188int extent_prepare_write(struct extent_io_tree *tree,
189 struct inode *inode, struct page *page,
190 unsigned from, unsigned to, get_extent_t *get_extent);
191int extent_commit_write(struct extent_io_tree *tree,
192 struct inode *inode, struct page *page,
193 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
199void set_page_extent_mapped(struct page *page);
200
201struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
202 u64 start, unsigned long len,
203 struct page *page0,
204 gfp_t mask);
205struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
206 u64 start, unsigned long len,
207 gfp_t mask);
208void free_extent_buffer(struct extent_buffer *eb);
209int read_extent_buffer_pages(struct extent_io_tree *tree,
210 struct extent_buffer *eb, u64 start, int wait,
211 get_extent_t *get_extent, int mirror_num);
212
213static inline void extent_buffer_get(struct extent_buffer *eb)
214{
215 atomic_inc(&eb->refs);
216}
217
218int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
219 unsigned long start,
220 unsigned long len);
221void read_extent_buffer(struct extent_buffer *eb, void *dst,
222 unsigned long start,
223 unsigned long len);
224void write_extent_buffer(struct extent_buffer *eb, const void *src,
225 unsigned long start, unsigned long len);
226void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
227 unsigned long dst_offset, unsigned long src_offset,
228 unsigned long len);
229void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
230 unsigned long src_offset, unsigned long len);
231void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
232 unsigned long src_offset, unsigned long len);
233void memset_extent_buffer(struct extent_buffer *eb, char c,
234 unsigned long start, unsigned long len);
235int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
236 struct extent_buffer *eb);
237int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
238int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
239int clear_extent_buffer_dirty(struct extent_io_tree *tree,
240 struct extent_buffer *eb);
241int set_extent_buffer_dirty(struct extent_io_tree *tree,
242 struct extent_buffer *eb);
243int set_extent_buffer_uptodate(struct extent_io_tree *tree,
244 struct extent_buffer *eb);
245int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
246 struct extent_buffer *eb);
247int extent_buffer_uptodate(struct extent_io_tree *tree,
248 struct extent_buffer *eb);
249int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
250 unsigned long min_len, char **token, char **map,
251 unsigned long *map_start,
252 unsigned long *map_len, int km);
253int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
254 unsigned long min_len, char **token, char **map,
255 unsigned long *map_start,
256 unsigned long *map_len, int km);
257void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
258int release_extent_buffer_tail_pages(struct extent_buffer *eb);
259int extent_range_uptodate(struct extent_io_tree *tree,
260 u64 start, u64 end);
261int extent_clear_unlock_delalloc(struct inode *inode,
262 struct extent_io_tree *tree,
263 u64 start, u64 end, struct page *locked_page,
264 int unlock_page,
265 int clear_unlock,
266 int clear_delalloc, int clear_dirty,
267 int set_writeback,
268 int end_writeback);
269#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..4a83e33ada32
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,351 @@
1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h>
8#include "extent_map.h"
9
10/* temporary define until extent_map moves out of btrfs */
11struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
12 unsigned long extra_flags,
13 void (*ctor)(void *, struct kmem_cache *,
14 unsigned long));
15
16static struct kmem_cache *extent_map_cache;
17
18int __init extent_map_init(void)
19{
20 extent_map_cache = btrfs_cache_create("extent_map",
21 sizeof(struct extent_map), 0,
22 NULL);
23 if (!extent_map_cache)
24 return -ENOMEM;
25 return 0;
26}
27
28void extent_map_exit(void)
29{
30 if (extent_map_cache)
31 kmem_cache_destroy(extent_map_cache);
32}
33
34/**
35 * extent_map_tree_init - initialize extent map tree
36 * @tree: tree to initialize
37 * @mask: flags for memory allocations during tree operations
38 *
39 * Initialize the extent tree @tree. Should be called for each new inode
40 * or other user of the extent_map interface.
41 */
42void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43{
44 tree->map.rb_node = NULL;
45 spin_lock_init(&tree->lock);
46}
47EXPORT_SYMBOL(extent_map_tree_init);
48
49/**
50 * alloc_extent_map - allocate new extent map structure
51 * @mask: memory allocation flags
52 *
53 * Allocate a new extent_map structure. The new structure is
54 * returned with a reference count of one and needs to be
55 * freed using free_extent_map()
56 */
57struct extent_map *alloc_extent_map(gfp_t mask)
58{
59 struct extent_map *em;
60 em = kmem_cache_alloc(extent_map_cache, mask);
61 if (!em || IS_ERR(em))
62 return em;
63 em->in_tree = 0;
64 em->flags = 0;
65 atomic_set(&em->refs, 1);
66 return em;
67}
68EXPORT_SYMBOL(alloc_extent_map);
69
70/**
71 * free_extent_map - drop reference count of an extent_map
72 * @em: extent map beeing releasead
73 *
74 * Drops the reference out on @em by one and free the structure
75 * if the reference count hits zero.
76 */
77void free_extent_map(struct extent_map *em)
78{
79 if (!em)
80 return;
81 WARN_ON(atomic_read(&em->refs) == 0);
82 if (atomic_dec_and_test(&em->refs)) {
83 WARN_ON(em->in_tree);
84 kmem_cache_free(extent_map_cache, em);
85 }
86}
87EXPORT_SYMBOL(free_extent_map);
88
89static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
90 struct rb_node *node)
91{
92 struct rb_node **p = &root->rb_node;
93 struct rb_node *parent = NULL;
94 struct extent_map *entry;
95
96 while (*p) {
97 parent = *p;
98 entry = rb_entry(parent, struct extent_map, rb_node);
99
100 WARN_ON(!entry->in_tree);
101
102 if (offset < entry->start)
103 p = &(*p)->rb_left;
104 else if (offset >= extent_map_end(entry))
105 p = &(*p)->rb_right;
106 else
107 return parent;
108 }
109
110 entry = rb_entry(node, struct extent_map, rb_node);
111 entry->in_tree = 1;
112 rb_link_node(node, parent, p);
113 rb_insert_color(node, root);
114 return NULL;
115}
116
117/*
118 * search through the tree for an extent_map with a given offset. If
119 * it can't be found, try to find some neighboring extents
120 */
121static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
122 struct rb_node **prev_ret,
123 struct rb_node **next_ret)
124{
125 struct rb_node *n = root->rb_node;
126 struct rb_node *prev = NULL;
127 struct rb_node *orig_prev = NULL;
128 struct extent_map *entry;
129 struct extent_map *prev_entry = NULL;
130
131 while (n) {
132 entry = rb_entry(n, struct extent_map, rb_node);
133 prev = n;
134 prev_entry = entry;
135
136 WARN_ON(!entry->in_tree);
137
138 if (offset < entry->start)
139 n = n->rb_left;
140 else if (offset >= extent_map_end(entry))
141 n = n->rb_right;
142 else
143 return n;
144 }
145
146 if (prev_ret) {
147 orig_prev = prev;
148 while (prev && offset >= extent_map_end(prev_entry)) {
149 prev = rb_next(prev);
150 prev_entry = rb_entry(prev, struct extent_map, rb_node);
151 }
152 *prev_ret = prev;
153 prev = orig_prev;
154 }
155
156 if (next_ret) {
157 prev_entry = rb_entry(prev, struct extent_map, rb_node);
158 while (prev && offset < prev_entry->start) {
159 prev = rb_prev(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
161 }
162 *next_ret = prev;
163 }
164 return NULL;
165}
166
167/*
168 * look for an offset in the tree, and if it can't be found, return
169 * the first offset we can find smaller than 'offset'.
170 */
171static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
172{
173 struct rb_node *prev;
174 struct rb_node *ret;
175 ret = __tree_search(root, offset, &prev, NULL);
176 if (!ret)
177 return prev;
178 return ret;
179}
180
181/* check to see if two extent_map structs are adjacent and safe to merge */
182static int mergable_maps(struct extent_map *prev, struct extent_map *next)
183{
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0;
186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
194 if (extent_map_end(prev) == next->start &&
195 prev->flags == next->flags &&
196 prev->bdev == next->bdev &&
197 ((next->block_start == EXTENT_MAP_HOLE &&
198 prev->block_start == EXTENT_MAP_HOLE) ||
199 (next->block_start == EXTENT_MAP_INLINE &&
200 prev->block_start == EXTENT_MAP_INLINE) ||
201 (next->block_start == EXTENT_MAP_DELALLOC &&
202 prev->block_start == EXTENT_MAP_DELALLOC) ||
203 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
204 next->block_start == extent_map_block_end(prev)))) {
205 return 1;
206 }
207 return 0;
208}
209
210/**
211 * add_extent_mapping - add new extent map to the extent tree
212 * @tree: tree to insert new map in
213 * @em: map to insert
214 *
215 * Insert @em into @tree or perform a simple forward/backward merge with
216 * existing mappings. The extent_map struct passed in will be inserted
217 * into the tree directly, with an additional reference taken, or a
218 * reference dropped if the merge attempt was sucessfull.
219 */
220int add_extent_mapping(struct extent_map_tree *tree,
221 struct extent_map *em)
222{
223 int ret = 0;
224 struct extent_map *merge = NULL;
225 struct rb_node *rb;
226 struct extent_map *exist;
227
228 exist = lookup_extent_mapping(tree, em->start, em->len);
229 if (exist) {
230 free_extent_map(exist);
231 ret = -EEXIST;
232 goto out;
233 }
234 assert_spin_locked(&tree->lock);
235 rb = tree_insert(&tree->map, em->start, &em->rb_node);
236 if (rb) {
237 ret = -EEXIST;
238 free_extent_map(merge);
239 goto out;
240 }
241 atomic_inc(&em->refs);
242 if (em->start != 0) {
243 rb = rb_prev(&em->rb_node);
244 if (rb)
245 merge = rb_entry(rb, struct extent_map, rb_node);
246 if (rb && mergable_maps(merge, em)) {
247 em->start = merge->start;
248 em->len += merge->len;
249 em->block_len += merge->block_len;
250 em->block_start = merge->block_start;
251 merge->in_tree = 0;
252 rb_erase(&merge->rb_node, &tree->map);
253 free_extent_map(merge);
254 }
255 }
256 rb = rb_next(&em->rb_node);
257 if (rb)
258 merge = rb_entry(rb, struct extent_map, rb_node);
259 if (rb && mergable_maps(em, merge)) {
260 em->len += merge->len;
261 em->block_len += merge->len;
262 rb_erase(&merge->rb_node, &tree->map);
263 merge->in_tree = 0;
264 free_extent_map(merge);
265 }
266out:
267 return ret;
268}
269EXPORT_SYMBOL(add_extent_mapping);
270
271/* simple helper to do math around the end of an extent, handling wrap */
272static u64 range_end(u64 start, u64 len)
273{
274 if (start + len < start)
275 return (u64)-1;
276 return start + len;
277}
278
279/**
280 * lookup_extent_mapping - lookup extent_map
281 * @tree: tree to lookup in
282 * @start: byte offset to start the search
283 * @len: length of the lookup range
284 *
285 * Find and return the first extent_map struct in @tree that intersects the
286 * [start, len] range. There may be additional objects in the tree that
287 * intersect, so check the object returned carefully to make sure that no
288 * additional lookups are needed.
289 */
290struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
291 u64 start, u64 len)
292{
293 struct extent_map *em;
294 struct rb_node *rb_node;
295 struct rb_node *prev = NULL;
296 struct rb_node *next = NULL;
297 u64 end = range_end(start, len);
298
299 assert_spin_locked(&tree->lock);
300 rb_node = __tree_search(&tree->map, start, &prev, &next);
301 if (!rb_node && prev) {
302 em = rb_entry(prev, struct extent_map, rb_node);
303 if (end > em->start && start < extent_map_end(em))
304 goto found;
305 }
306 if (!rb_node && next) {
307 em = rb_entry(next, struct extent_map, rb_node);
308 if (end > em->start && start < extent_map_end(em))
309 goto found;
310 }
311 if (!rb_node) {
312 em = NULL;
313 goto out;
314 }
315 if (IS_ERR(rb_node)) {
316 em = ERR_PTR(PTR_ERR(rb_node));
317 goto out;
318 }
319 em = rb_entry(rb_node, struct extent_map, rb_node);
320 if (end > em->start && start < extent_map_end(em))
321 goto found;
322
323 em = NULL;
324 goto out;
325
326found:
327 atomic_inc(&em->refs);
328out:
329 return em;
330}
331EXPORT_SYMBOL(lookup_extent_mapping);
332
333/**
334 * remove_extent_mapping - removes an extent_map from the extent tree
335 * @tree: extent tree to remove from
336 * @em: extent map beeing removed
337 *
338 * Removes @em from @tree. No reference counts are dropped, and no checks
339 * are done to see if the range is in use
340 */
341int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
342{
343 int ret = 0;
344
345 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
346 assert_spin_locked(&tree->lock);
347 rb_erase(&em->rb_node, &tree->map);
348 em->in_tree = 0;
349 return ret;
350}
351EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..fb6eeef06bb0
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,62 @@
1#ifndef __EXTENTMAP__
2#define __EXTENTMAP__
3
4#include <linux/rbtree.h>
5
/*
 * Special block_start values stored at the very top of the u64 range.
 * Each expansion is fully parenthesized so that it stays a single operand
 * in any context (e.g. "sizeof EXTENT_MAP_HOLE").
 */
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
#define EXTENT_MAP_DELALLOC ((u64)-1)

/* bits for the flags field */
#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
#define EXTENT_FLAG_COMPRESSED 1
#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16
17struct extent_map {
18 struct rb_node rb_node;
19
20 /* all of these are in bytes */
21 u64 start;
22 u64 len;
23 u64 orig_start;
24 u64 block_start;
25 u64 block_len;
26 unsigned long flags;
27 struct block_device *bdev;
28 atomic_t refs;
29 int in_tree;
30};
31
32struct extent_map_tree {
33 struct rb_root map;
34 spinlock_t lock;
35};
36
37static inline u64 extent_map_end(struct extent_map *em)
38{
39 if (em->start + em->len < em->start)
40 return (u64)-1;
41 return em->start + em->len;
42}
43
44static inline u64 extent_map_block_end(struct extent_map *em)
45{
46 if (em->block_start + em->block_len < em->block_start)
47 return (u64)-1;
48 return em->block_start + em->block_len;
49}
50
51void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
52struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
53 u64 start, u64 len);
54int add_extent_mapping(struct extent_map_tree *tree,
55 struct extent_map *em);
56int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
57
58struct extent_map *alloc_extent_map(gfp_t mask);
59void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void);
61void extent_map_exit(void);
62#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..964652435fd1
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,831 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bio.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "print-tree.h"
26
/*
 * Number of checksums of @size bytes that fit in one leaf of @r once room
 * for two item headers is set aside, minus one.  @size is parenthesized
 * so that an expression argument divides as a whole.
 */
#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
				   sizeof(struct btrfs_item) * 2) / \
				  (size)) - 1))

/*
 * Bytes of data described by the sector sums that fit in one page after
 * the btrfs_ordered_sum header, less one sector.
 */
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
				   sizeof(struct btrfs_ordered_sum)) / \
				   sizeof(struct btrfs_sector_sum) * \
				   (r)->sectorsize - (r)->sectorsize)
35
36int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root,
38 u64 objectid, u64 pos,
39 u64 disk_offset, u64 disk_num_bytes,
40 u64 num_bytes, u64 offset, u64 ram_bytes,
41 u8 compression, u8 encryption, u16 other_encoding)
42{
43 int ret = 0;
44 struct btrfs_file_extent_item *item;
45 struct btrfs_key file_key;
46 struct btrfs_path *path;
47 struct extent_buffer *leaf;
48
49 path = btrfs_alloc_path();
50 BUG_ON(!path);
51 file_key.objectid = objectid;
52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item));
57 if (ret < 0)
58 goto out;
59 BUG_ON(ret);
60 leaf = path->nodes[0];
61 item = btrfs_item_ptr(leaf, path->slots[0],
62 struct btrfs_file_extent_item);
63 btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
64 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
65 btrfs_set_file_extent_offset(leaf, item, offset);
66 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
67 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
68 btrfs_set_file_extent_generation(leaf, item, trans->transid);
69 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
70 btrfs_set_file_extent_compression(leaf, item, compression);
71 btrfs_set_file_extent_encryption(leaf, item, encryption);
72 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
73
74 btrfs_mark_buffer_dirty(leaf);
75out:
76 btrfs_free_path(path);
77 return ret;
78}
79
80struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root,
82 struct btrfs_path *path,
83 u64 bytenr, int cow)
84{
85 int ret;
86 struct btrfs_key file_key;
87 struct btrfs_key found_key;
88 struct btrfs_csum_item *item;
89 struct extent_buffer *leaf;
90 u64 csum_offset = 0;
91 u16 csum_size =
92 btrfs_super_csum_size(&root->fs_info->super_copy);
93 int csums_in_item;
94
95 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
96 file_key.offset = bytenr;
97 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
98 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
99 if (ret < 0)
100 goto fail;
101 leaf = path->nodes[0];
102 if (ret > 0) {
103 ret = 1;
104 if (path->slots[0] == 0)
105 goto fail;
106 path->slots[0]--;
107 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
108 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
109 goto fail;
110
111 csum_offset = (bytenr - found_key.offset) >>
112 root->fs_info->sb->s_blocksize_bits;
113 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
114 csums_in_item /= csum_size;
115
116 if (csum_offset >= csums_in_item) {
117 ret = -EFBIG;
118 goto fail;
119 }
120 }
121 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
122 item = (struct btrfs_csum_item *)((unsigned char *)item +
123 csum_offset * csum_size);
124 return item;
125fail:
126 if (ret > 0)
127 ret = -ENOENT;
128 return ERR_PTR(ret);
129}
130
131
132int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
133 struct btrfs_root *root,
134 struct btrfs_path *path, u64 objectid,
135 u64 offset, int mod)
136{
137 int ret;
138 struct btrfs_key file_key;
139 int ins_len = mod < 0 ? -1 : 0;
140 int cow = mod != 0;
141
142 file_key.objectid = objectid;
143 file_key.offset = offset;
144 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
145 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
146 return ret;
147}
148
149
150int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
151 struct bio *bio, u32 *dst)
152{
153 u32 sum;
154 struct bio_vec *bvec = bio->bi_io_vec;
155 int bio_index = 0;
156 u64 offset;
157 u64 item_start_offset = 0;
158 u64 item_last_offset = 0;
159 u64 disk_bytenr;
160 u32 diff;
161 u16 csum_size =
162 btrfs_super_csum_size(&root->fs_info->super_copy);
163 int ret;
164 struct btrfs_path *path;
165 struct btrfs_csum_item *item = NULL;
166 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
167
168 path = btrfs_alloc_path();
169 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
170 path->reada = 2;
171
172 WARN_ON(bio->bi_vcnt <= 0);
173
174 disk_bytenr = (u64)bio->bi_sector << 9;
175 while (bio_index < bio->bi_vcnt) {
176 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
177 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
178 if (ret == 0)
179 goto found;
180
181 if (!item || disk_bytenr < item_start_offset ||
182 disk_bytenr >= item_last_offset) {
183 struct btrfs_key found_key;
184 u32 item_size;
185
186 if (item)
187 btrfs_release_path(root, path);
188 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
189 path, disk_bytenr, 0);
190 if (IS_ERR(item)) {
191 ret = PTR_ERR(item);
192 if (ret == -ENOENT || ret == -EFBIG)
193 ret = 0;
194 sum = 0;
195 if (BTRFS_I(inode)->root->root_key.objectid ==
196 BTRFS_DATA_RELOC_TREE_OBJECTID) {
197 set_extent_bits(io_tree, offset,
198 offset + bvec->bv_len - 1,
199 EXTENT_NODATASUM, GFP_NOFS);
200 } else {
201 printk(KERN_INFO "btrfs no csum found "
202 "for inode %lu start %llu\n",
203 inode->i_ino,
204 (unsigned long long)offset);
205 }
206 item = NULL;
207 btrfs_release_path(root, path);
208 goto found;
209 }
210 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
211 path->slots[0]);
212
213 item_start_offset = found_key.offset;
214 item_size = btrfs_item_size_nr(path->nodes[0],
215 path->slots[0]);
216 item_last_offset = item_start_offset +
217 (item_size / csum_size) *
218 root->sectorsize;
219 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
220 struct btrfs_csum_item);
221 }
222 /*
223 * this byte range must be able to fit inside
224 * a single leaf so it will also fit inside a u32
225 */
226 diff = disk_bytenr - item_start_offset;
227 diff = diff / root->sectorsize;
228 diff = diff * csum_size;
229
230 read_extent_buffer(path->nodes[0], &sum,
231 ((unsigned long)item) + diff,
232 csum_size);
233found:
234 if (dst)
235 *dst++ = sum;
236 else
237 set_state_private(io_tree, offset, sum);
238 disk_bytenr += bvec->bv_len;
239 bio_index++;
240 bvec++;
241 }
242 btrfs_free_path(path);
243 return 0;
244}
245
246int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
247 struct list_head *list)
248{
249 struct btrfs_key key;
250 struct btrfs_path *path;
251 struct extent_buffer *leaf;
252 struct btrfs_ordered_sum *sums;
253 struct btrfs_sector_sum *sector_sum;
254 struct btrfs_csum_item *item;
255 unsigned long offset;
256 int ret;
257 size_t size;
258 u64 csum_end;
259 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
260
261 path = btrfs_alloc_path();
262 BUG_ON(!path);
263
264 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
265 key.offset = start;
266 key.type = BTRFS_EXTENT_CSUM_KEY;
267
268 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
269 if (ret < 0)
270 goto fail;
271 if (ret > 0 && path->slots[0] > 0) {
272 leaf = path->nodes[0];
273 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
274 if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
275 key.type == BTRFS_EXTENT_CSUM_KEY) {
276 offset = (start - key.offset) >>
277 root->fs_info->sb->s_blocksize_bits;
278 if (offset * csum_size <
279 btrfs_item_size_nr(leaf, path->slots[0] - 1))
280 path->slots[0]--;
281 }
282 }
283
284 while (start <= end) {
285 leaf = path->nodes[0];
286 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
287 ret = btrfs_next_leaf(root, path);
288 if (ret < 0)
289 goto fail;
290 if (ret > 0)
291 break;
292 leaf = path->nodes[0];
293 }
294
295 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
296 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
297 key.type != BTRFS_EXTENT_CSUM_KEY)
298 break;
299
300 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
301 if (key.offset > end)
302 break;
303
304 if (key.offset > start)
305 start = key.offset;
306
307 size = btrfs_item_size_nr(leaf, path->slots[0]);
308 csum_end = key.offset + (size / csum_size) * root->sectorsize;
309 if (csum_end <= start) {
310 path->slots[0]++;
311 continue;
312 }
313
314 csum_end = min(csum_end, end + 1);
315 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
316 struct btrfs_csum_item);
317 while (start < csum_end) {
318 size = min_t(size_t, csum_end - start,
319 MAX_ORDERED_SUM_BYTES(root));
320 sums = kzalloc(btrfs_ordered_sum_size(root, size),
321 GFP_NOFS);
322 BUG_ON(!sums);
323
324 sector_sum = sums->sums;
325 sums->bytenr = start;
326 sums->len = size;
327
328 offset = (start - key.offset) >>
329 root->fs_info->sb->s_blocksize_bits;
330 offset *= csum_size;
331
332 while (size > 0) {
333 read_extent_buffer(path->nodes[0],
334 &sector_sum->sum,
335 ((unsigned long)item) +
336 offset, csum_size);
337 sector_sum->bytenr = start;
338
339 size -= root->sectorsize;
340 start += root->sectorsize;
341 offset += csum_size;
342 sector_sum++;
343 }
344 list_add_tail(&sums->list, list);
345 }
346 path->slots[0]++;
347 }
348 ret = 0;
349fail:
350 btrfs_free_path(path);
351 return ret;
352}
353
354int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
355 struct bio *bio, u64 file_start, int contig)
356{
357 struct btrfs_ordered_sum *sums;
358 struct btrfs_sector_sum *sector_sum;
359 struct btrfs_ordered_extent *ordered;
360 char *data;
361 struct bio_vec *bvec = bio->bi_io_vec;
362 int bio_index = 0;
363 unsigned long total_bytes = 0;
364 unsigned long this_sum_bytes = 0;
365 u64 offset;
366 u64 disk_bytenr;
367
368 WARN_ON(bio->bi_vcnt <= 0);
369 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
370 if (!sums)
371 return -ENOMEM;
372
373 sector_sum = sums->sums;
374 disk_bytenr = (u64)bio->bi_sector << 9;
375 sums->len = bio->bi_size;
376 INIT_LIST_HEAD(&sums->list);
377
378 if (contig)
379 offset = file_start;
380 else
381 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
382
383 ordered = btrfs_lookup_ordered_extent(inode, offset);
384 BUG_ON(!ordered);
385 sums->bytenr = ordered->start;
386
387 while (bio_index < bio->bi_vcnt) {
388 if (!contig)
389 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
390
391 if (!contig && (offset >= ordered->file_offset + ordered->len ||
392 offset < ordered->file_offset)) {
393 unsigned long bytes_left;
394 sums->len = this_sum_bytes;
395 this_sum_bytes = 0;
396 btrfs_add_ordered_sum(inode, ordered, sums);
397 btrfs_put_ordered_extent(ordered);
398
399 bytes_left = bio->bi_size - total_bytes;
400
401 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
402 GFP_NOFS);
403 BUG_ON(!sums);
404 sector_sum = sums->sums;
405 sums->len = bytes_left;
406 ordered = btrfs_lookup_ordered_extent(inode, offset);
407 BUG_ON(!ordered);
408 sums->bytenr = ordered->start;
409 }
410
411 data = kmap_atomic(bvec->bv_page, KM_USER0);
412 sector_sum->sum = ~(u32)0;
413 sector_sum->sum = btrfs_csum_data(root,
414 data + bvec->bv_offset,
415 sector_sum->sum,
416 bvec->bv_len);
417 kunmap_atomic(data, KM_USER0);
418 btrfs_csum_final(sector_sum->sum,
419 (char *)&sector_sum->sum);
420 sector_sum->bytenr = disk_bytenr;
421
422 sector_sum++;
423 bio_index++;
424 total_bytes += bvec->bv_len;
425 this_sum_bytes += bvec->bv_len;
426 disk_bytenr += bvec->bv_len;
427 offset += bvec->bv_len;
428 bvec++;
429 }
430 this_sum_bytes = 0;
431 btrfs_add_ordered_sum(inode, ordered, sums);
432 btrfs_put_ordered_extent(ordered);
433 return 0;
434}
435
436/*
437 * helper function for csum removal, this expects the
438 * key to describe the csum pointed to by the path, and it expects
439 * the csum to overlap the range [bytenr, len]
440 *
441 * The csum should not be entirely contained in the range and the
442 * range should not be entirely contained in the csum.
443 *
444 * This calls btrfs_truncate_item with the correct args based on the
445 * overlap, and fixes up the key as required.
446 */
447static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
448 struct btrfs_root *root,
449 struct btrfs_path *path,
450 struct btrfs_key *key,
451 u64 bytenr, u64 len)
452{
453 struct extent_buffer *leaf;
454 u16 csum_size =
455 btrfs_super_csum_size(&root->fs_info->super_copy);
456 u64 csum_end;
457 u64 end_byte = bytenr + len;
458 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
459 int ret;
460
461 leaf = path->nodes[0];
462 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
463 csum_end <<= root->fs_info->sb->s_blocksize_bits;
464 csum_end += key->offset;
465
466 if (key->offset < bytenr && csum_end <= end_byte) {
467 /*
468 * [ bytenr - len ]
469 * [ ]
470 * [csum ]
471 * A simple truncate off the end of the item
472 */
473 u32 new_size = (bytenr - key->offset) >> blocksize_bits;
474 new_size *= csum_size;
475 ret = btrfs_truncate_item(trans, root, path, new_size, 1);
476 BUG_ON(ret);
477 } else if (key->offset >= bytenr && csum_end > end_byte &&
478 end_byte > key->offset) {
479 /*
480 * [ bytenr - len ]
481 * [ ]
482 * [csum ]
483 * we need to truncate from the beginning of the csum
484 */
485 u32 new_size = (csum_end - end_byte) >> blocksize_bits;
486 new_size *= csum_size;
487
488 ret = btrfs_truncate_item(trans, root, path, new_size, 0);
489 BUG_ON(ret);
490
491 key->offset = end_byte;
492 ret = btrfs_set_item_key_safe(trans, root, path, key);
493 BUG_ON(ret);
494 } else {
495 BUG();
496 }
497 return 0;
498}
499
500/*
501 * deletes the csum items from the csum tree for a given
502 * range of bytes.
503 */
504int btrfs_del_csums(struct btrfs_trans_handle *trans,
505 struct btrfs_root *root, u64 bytenr, u64 len)
506{
507 struct btrfs_path *path;
508 struct btrfs_key key;
509 u64 end_byte = bytenr + len;
510 u64 csum_end;
511 struct extent_buffer *leaf;
512 int ret;
513 u16 csum_size =
514 btrfs_super_csum_size(&root->fs_info->super_copy);
515 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
516
517 root = root->fs_info->csum_root;
518
519 path = btrfs_alloc_path();
520
521 while (1) {
522 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
523 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY;
525
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) {
528 if (path->slots[0] == 0)
529 goto out;
530 path->slots[0]--;
531 }
532 leaf = path->nodes[0];
533 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
534
535 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
536 key.type != BTRFS_EXTENT_CSUM_KEY) {
537 break;
538 }
539
540 if (key.offset >= end_byte)
541 break;
542
543 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
544 csum_end <<= blocksize_bits;
545 csum_end += key.offset;
546
547 /* this csum ends before we start, we're done */
548 if (csum_end <= bytenr)
549 break;
550
551 /* delete the entire item, it is inside our range */
552 if (key.offset >= bytenr && csum_end <= end_byte) {
553 ret = btrfs_del_item(trans, root, path);
554 BUG_ON(ret);
555 if (key.offset == bytenr)
556 break;
557 } else if (key.offset < bytenr && csum_end > end_byte) {
558 unsigned long offset;
559 unsigned long shift_len;
560 unsigned long item_offset;
561 /*
562 * [ bytenr - len ]
563 * [csum ]
564 *
565 * Our bytes are in the middle of the csum,
566 * we need to split this item and insert a new one.
567 *
568 * But we can't drop the path because the
569 * csum could change, get removed, extended etc.
570 *
571 * The trick here is the max size of a csum item leaves
572 * enough room in the tree block for a single
573 * item header. So, we split the item in place,
574 * adding a new header pointing to the existing
575 * bytes. Then we loop around again and we have
576 * a nicely formed csum item that we can neatly
577 * truncate.
578 */
579 offset = (bytenr - key.offset) >> blocksize_bits;
580 offset *= csum_size;
581
582 shift_len = (len >> blocksize_bits) * csum_size;
583
584 item_offset = btrfs_item_ptr_offset(leaf,
585 path->slots[0]);
586
587 memset_extent_buffer(leaf, 0, item_offset + offset,
588 shift_len);
589 key.offset = bytenr;
590
591 /*
592 * btrfs_split_item returns -EAGAIN when the
593 * item changed size or key
594 */
595 ret = btrfs_split_item(trans, root, path, &key, offset);
596 BUG_ON(ret && ret != -EAGAIN);
597
598 key.offset = end_byte - 1;
599 } else {
600 ret = truncate_one_csum(trans, root, path,
601 &key, bytenr, len);
602 BUG_ON(ret);
603 if (key.offset < bytenr)
604 break;
605 }
606 btrfs_release_path(root, path);
607 }
608out:
609 btrfs_free_path(path);
610 return 0;
611}
612
/*
 * Store the per-sector checksums carried in 'sums' as csum items in the
 * tree given by 'root'.
 *
 * Starting at the 'again' label, the current sector is handled one of
 * three ways:
 *  - btrfs_lookup_csum() finds an item that already has room for it
 *    (jump to 'found'),
 *  - the lookup reports -EFBIG: an item exists but is too short, so the
 *    code tries to extend it by one checksum (jump to 'csum'),
 *  - otherwise a new item is inserted (label 'insert').
 * After that, checksums are copied into the item for as long as there is
 * room and the sectors stay contiguous, and the whole thing repeats until
 * sums->len bytes have been covered.
 *
 * Returns 0 once everything is written, otherwise the non-zero result of
 * the failing search or insert.  A failed path allocation trips BUG_ON().
 */
613int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
614 struct btrfs_root *root,
615 struct btrfs_ordered_sum *sums)
616{
617 u64 bytenr;
618 int ret;
619 struct btrfs_key file_key;
620 struct btrfs_key found_key;
621 u64 next_offset;
622 u64 total_bytes = 0;
623 int found_next;
624 struct btrfs_path *path;
625 struct btrfs_csum_item *item;
626 struct btrfs_csum_item *item_end;
627 struct extent_buffer *leaf = NULL;
628 u64 csum_offset;
629 struct btrfs_sector_sum *sector_sum;
630 u32 nritems;
631 u32 ins_size;
632 char *eb_map;
633 char *eb_token;
/* map_len and map_start are only read while eb_token is non-NULL */
634 unsigned long map_len;
635 unsigned long map_start;
636 u16 csum_size =
637 btrfs_super_csum_size(&root->fs_info->super_copy);
638
639 path = btrfs_alloc_path();
640 BUG_ON(!path);
641 sector_sum = sums->sums;
642again:
/* (re)start for the sector that sector_sum currently points at */
643 next_offset = (u64)-1;
644 found_next = 0;
645 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
646 file_key.offset = sector_sum->bytenr;
647 bytenr = sector_sum->bytenr;
648 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
649
650 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
651 if (!IS_ERR(item)) {
652 leaf = path->nodes[0];
653 ret = 0;
654 goto found;
655 }
656 ret = PTR_ERR(item);
657 if (ret == -EFBIG) {
658 u32 item_size;
659 /* we found one, but it isn't big enough yet */
660 leaf = path->nodes[0];
661 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
662 if ((item_size / csum_size) >=
663 MAX_CSUM_ITEMS(root, csum_size)) {
664 /* already at max size, make a new one */
665 goto insert;
666 }
667 } else {
668 int slot = path->slots[0] + 1;
669 /* we didn't find a csum item, insert one */
/*
 * Peek at the key of the following item: when it is a csum item its
 * offset becomes next_offset, which later caps the size of the item
 * that gets inserted.
 */
670 nritems = btrfs_header_nritems(path->nodes[0]);
671 if (path->slots[0] >= nritems - 1) {
672 ret = btrfs_next_leaf(root, path);
673 if (ret == 1)
674 found_next = 1;
675 if (ret != 0)
676 goto insert;
677 slot = 0;
678 }
679 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
680 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
681 found_key.type != BTRFS_EXTENT_CSUM_KEY) {
682 found_next = 1;
683 goto insert;
684 }
685 next_offset = found_key.offset;
686 found_next = 1;
687 goto insert;
688 }
689
690 /*
691 * at this point, we know the tree has an item, but it isn't big
692 * enough yet to put our csum in. Grow it
693 */
694 btrfs_release_path(root, path);
695 ret = btrfs_search_slot(trans, root, &file_key, path,
696 csum_size, 1);
697 if (ret < 0)
698 goto fail_unlock;
699
700 if (ret > 0) {
701 if (path->slots[0] == 0)
702 goto insert;
703 path->slots[0]--;
704 }
705
706 leaf = path->nodes[0];
707 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
/* index of our checksum inside the item that was found */
708 csum_offset = (bytenr - found_key.offset) >>
709 root->fs_info->sb->s_blocksize_bits;
710
711 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
712 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
713 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
714 goto insert;
715 }
716
717 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
718 csum_size) {
719 u32 diff = (csum_offset + 1) * csum_size;
720
721 /*
722 * is the item big enough already? we dropped our lock
723 * before and need to recheck
724 */
725 if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
726 goto csum;
727
/* only grow the item when exactly one more checksum is needed */
728 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
729 if (diff != csum_size)
730 goto insert;
731
732 ret = btrfs_extend_item(trans, root, path, diff);
733 BUG_ON(ret);
734 goto csum;
735 }
736
737insert:
738 btrfs_release_path(root, path);
739 csum_offset = 0;
740 if (found_next) {
/*
 * Size the new item for the run of contiguous sectors that follows,
 * limited by the start of the next csum item (next_offset) and by
 * MAX_CSUM_ITEMS, and never smaller than one checksum.
 */
741 u64 tmp = total_bytes + root->sectorsize;
742 u64 next_sector = sector_sum->bytenr;
743 struct btrfs_sector_sum *next = sector_sum + 1;
744
745 while (tmp < sums->len) {
746 if (next_sector + root->sectorsize != next->bytenr)
747 break;
748 tmp += root->sectorsize;
749 next_sector = next->bytenr;
750 next++;
751 }
752 tmp = min(tmp, next_offset - file_key.offset);
753 tmp >>= root->fs_info->sb->s_blocksize_bits;
754 tmp = max((u64)1, tmp);
755 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
756 ins_size = csum_size * tmp;
757 } else {
758 ins_size = csum_size;
759 }
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size);
762 if (ret < 0)
763 goto fail_unlock;
764 if (ret != 0) {
765 WARN_ON(1);
766 goto fail_unlock;
767 }
768csum:
/* point 'item' at slot csum_offset inside the item at path->slots[0] */
769 leaf = path->nodes[0];
770 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
771 ret = 0;
772 item = (struct btrfs_csum_item *)((unsigned char *)item +
773 csum_offset * csum_size);
774found:
/* item_end is one past the last byte of the item; it bounds the copy loop */
775 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL;
779 cond_resched();
780next_sector:
781
/*
 * (Re)map the extent buffer when nothing is mapped yet or the next
 * checksum would reach the end of the current mapping.  If mapping
 * fails, eb_token stays NULL and write_extent_buffer() is used instead.
 */
782 if (!eb_token ||
783 (unsigned long)item + csum_size >= map_start + map_len) {
784 int err;
785
786 if (eb_token)
787 unmap_extent_buffer(leaf, eb_token, KM_USER1);
788 eb_token = NULL;
789 err = map_private_extent_buffer(leaf, (unsigned long)item,
790 csum_size,
791 &eb_token, &eb_map,
792 &map_start, &map_len, KM_USER1);
793 if (err)
794 eb_token = NULL;
795 }
796 if (eb_token) {
797 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
798 &sector_sum->sum, csum_size);
799 } else {
800 write_extent_buffer(leaf, &sector_sum->sum,
801 (unsigned long)item, csum_size);
802 }
803
804 total_bytes += root->sectorsize;
805 sector_sum++;
806 if (total_bytes < sums->len) {
807 item = (struct btrfs_csum_item *)((char *)item +
808 csum_size);
/*
 * NOTE(review): contiguity is tested with PAGE_CACHE_SIZE here, while
 * the sizing loop under 'insert' steps by root->sectorsize -- confirm
 * the two are meant to be the same value.
 */
809 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
810 sector_sum->bytenr) {
811 bytenr = sector_sum->bytenr;
812 goto next_sector;
813 }
814 }
815 if (eb_token) {
816 unmap_extent_buffer(leaf, eb_token, KM_USER1);
817 eb_token = NULL;
818 }
819 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) {
/* more sectors left: start over with a fresh lookup */
822 btrfs_release_path(root, path);
823 goto again;
824 }
825out:
826 btrfs_free_path(path);
827 return ret;
828
/* error exit; currently does nothing beyond the common cleanup at 'out' */
829fail_unlock:
830 goto out;
831}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..90268334145e
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1288 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h>
27#include <linux/mpage.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h"
34#include "disk-io.h"
35#include "transaction.h"
36#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h"
39#include "tree-log.h"
40#include "locking.h"
41#include "compat.h"
42
43
44/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code.
46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes,
49 struct page **prepared_pages,
50 const char __user *buf)
51{
52 long page_fault = 0;
53 int i;
54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57 size_t count = min_t(size_t,
58 PAGE_CACHE_SIZE - offset, write_bytes);
59 struct page *page = prepared_pages[i];
60 fault_in_pages_readable(buf, count);
61
62 /* Copy data from userspace to the current page */
63 kmap(page);
64 page_fault = __copy_from_user(page_address(page) + offset,
65 buf, count);
66 /* Flush processor's dcache for this page */
67 flush_dcache_page(page);
68 kunmap(page);
69 buf += count;
70 write_bytes -= count;
71
72 if (page_fault)
73 break;
74 }
75 return page_fault ? -EFAULT : 0;
76}
77
78/*
79 * unlocks pages after btrfs_file_write is done with them
80 */
81static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
82{
83 size_t i;
84 for (i = 0; i < num_pages; i++) {
85 if (!pages[i])
86 break;
87 /* page checked is some magic around finding pages that
88 * have been modified without going through btrfs_set_page_dirty
89 * clear it here
90 */
91 ClearPageChecked(pages[i]);
92 unlock_page(pages[i]);
93 mark_page_accessed(pages[i]);
94 page_cache_release(pages[i]);
95 }
96}
97
98/*
99 * after copy_from_user, pages need to be dirtied and we need to make
100 * sure holes are created between the current EOF and the start of
101 * any next extents (if required).
102 *
103 * this also makes the decision about creating an inline extent vs
104 * doing real data extents, marking pages dirty and delalloc as required.
105 */
106static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root,
108 struct file *file,
109 struct page **pages,
110 size_t num_pages,
111 loff_t pos,
112 size_t write_bytes)
113{
114 int err = 0;
115 int i;
116 struct inode *inode = fdentry(file)->d_inode;
117 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
118 u64 hint_byte;
119 u64 num_bytes;
120 u64 start_pos;
121 u64 end_of_last_block;
122 u64 end_pos = pos + write_bytes;
123 loff_t isize = i_size_read(inode);
124
125 start_pos = pos & ~((u64)root->sectorsize - 1);
126 num_bytes = (write_bytes + pos - start_pos +
127 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
128
129 end_of_last_block = start_pos + num_bytes - 1;
130
131 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
132 trans = btrfs_join_transaction(root, 1);
133 if (!trans) {
134 err = -ENOMEM;
135 goto out_unlock;
136 }
137 btrfs_set_trans_block_group(trans, inode);
138 hint_byte = 0;
139
140 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
141
142 /* check for reserved extents on each page, we don't want
143 * to reset the delalloc bit on things that already have
144 * extents reserved.
145 */
146 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
147 for (i = 0; i < num_pages; i++) {
148 struct page *p = pages[i];
149 SetPageUptodate(p);
150 ClearPageChecked(p);
151 set_page_dirty(p);
152 }
153 if (end_pos > isize) {
154 i_size_write(inode, end_pos);
155 btrfs_update_inode(trans, root, inode);
156 }
157 err = btrfs_end_transaction(trans, root);
158out_unlock:
159 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
160 return err;
161}
162
163/*
164 * this drops all the extents in the cache that intersect the range
165 * [start, end]. Existing extents are split as required.
166 */
/*
 * inode:       inode whose extent map tree is trimmed.
 * start, end:  inclusive byte range; end == (u64)-1 means "to the end",
 *              in which case no tail piece is ever re-inserted.
 * skip_pinned: when non-zero, mappings with EXTENT_FLAG_PINNED set are
 *              left in the tree and the search range is narrowed around
 *              them.
 *
 * Always returns 0.
 *
 * NOTE(review): the results of alloc_extent_map() are not checked before
 * 'split' is dereferenced below -- confirm that it cannot fail here.
 */
167int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
168 int skip_pinned)
169{
170 struct extent_map *em;
171 struct extent_map *split = NULL;
172 struct extent_map *split2 = NULL;
173 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
174 u64 len = end - start + 1;
175 int ret;
176 int testend = 1;
177 unsigned long flags;
178 int compressed = 0;
179
180 WARN_ON(end < start);
181 if (end == (u64)-1) {
182 len = (u64)-1;
183 testend = 0;
184 }
185 while (1) {
/*
 * Up to two replacement maps (front piece and tail piece) may be
 * needed per removed mapping; they are allocated before the spinlock
 * is taken.
 */
186 if (!split)
187 split = alloc_extent_map(GFP_NOFS);
188 if (!split2)
189 split2 = alloc_extent_map(GFP_NOFS);
190
191 spin_lock(&em_tree->lock);
192 em = lookup_extent_mapping(em_tree, start, len);
193 if (!em) {
194 spin_unlock(&em_tree->lock);
195 break;
196 }
/* remember the flags before the pinned bit is cleared further down */
197 flags = em->flags;
198 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
199 spin_unlock(&em_tree->lock);
/* the pinned mapping covers all that is left of the range: done */
200 if (em->start <= start &&
201 (!testend || em->start + em->len >= start + len)) {
202 free_extent_map(em);
203 break;
204 }
/* otherwise shrink the range to the part before or after it */
205 if (start < em->start) {
206 len = em->start - start;
207 } else {
208 len = start + len - (em->start + em->len);
209 start = em->start + em->len;
210 }
211 free_extent_map(em);
212 continue;
213 }
214 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
215 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
216 remove_extent_mapping(em_tree, em);
217
/* re-insert the piece of the mapping that lies before 'start' */
218 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
219 em->start < start) {
220 split->start = em->start;
221 split->len = start - em->start;
222 split->orig_start = em->orig_start;
223 split->block_start = em->block_start;
224
225 if (compressed)
226 split->block_len = em->block_len;
227 else
228 split->block_len = split->len;
229
230 split->bdev = em->bdev;
231 split->flags = flags;
232 ret = add_extent_mapping(em_tree, split);
233 BUG_ON(ret);
234 free_extent_map(split);
235 split = split2;
236 split2 = NULL;
237 }
/* re-insert the piece of the mapping that lies after the range */
238 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
239 testend && em->start + em->len > start + len) {
240 u64 diff = start + len - em->start;
241
242 split->start = start + len;
243 split->len = em->start + em->len - (start + len);
244 split->bdev = em->bdev;
245 split->flags = flags;
246
247 if (compressed) {
248 split->block_len = em->block_len;
249 split->block_start = em->block_start;
250 split->orig_start = em->orig_start;
251 } else {
252 split->block_len = split->len;
253 split->block_start = em->block_start + diff;
254 split->orig_start = split->start;
255 }
256
257 ret = add_extent_mapping(em_tree, split);
258 BUG_ON(ret);
259 free_extent_map(split);
260 split = NULL;
261 }
262 spin_unlock(&em_tree->lock);
263
264 /* once for us */
265 free_extent_map(em);
266 /* once for the tree*/
267 free_extent_map(em);
268 }
/* release whichever preallocated maps were not consumed */
269 if (split)
270 free_extent_map(split);
271 if (split2)
272 free_extent_map(split2);
273 return 0;
274}
275
/*
 * Debugging hook that is currently a stub: it returns 0 unconditionally
 * and neither argument is used.
 *
 * Everything between '#if 0' and '#endif' is compiled out.  That disabled
 * code walks the inode's file extent items in key order and reports an
 * item whose offset is lower than the end of the previous extent.
 */
276int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
277{
278 return 0;
279#if 0
280 struct btrfs_path *path;
281 struct btrfs_key found_key;
282 struct extent_buffer *leaf;
283 struct btrfs_file_extent_item *extent;
284 u64 last_offset = 0;
285 int nritems;
286 int slot;
287 int found_type;
288 int ret;
289 int err = 0;
290 u64 extent_end = 0;
291
292 path = btrfs_alloc_path();
293 ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
294 last_offset, 0);
295 while (1) {
296 nritems = btrfs_header_nritems(path->nodes[0]);
297 if (path->slots[0] >= nritems) {
298 ret = btrfs_next_leaf(root, path);
299 if (ret)
300 goto out;
301 nritems = btrfs_header_nritems(path->nodes[0]);
302 }
303 slot = path->slots[0];
304 leaf = path->nodes[0];
305 btrfs_item_key_to_cpu(leaf, &found_key, slot);
306 if (found_key.objectid != inode->i_ino)
307 break;
308 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
309 goto out;
310
/* an extent starting before the previous one ended is reported */
311 if (found_key.offset < last_offset) {
312 WARN_ON(1);
313 btrfs_print_leaf(root, leaf);
314 printk(KERN_ERR "inode %lu found offset %llu "
315 "expected %llu\n", inode->i_ino,
316 (unsigned long long)found_key.offset,
317 (unsigned long long)last_offset);
318 err = 1;
319 goto out;
320 }
321 extent = btrfs_item_ptr(leaf, slot,
322 struct btrfs_file_extent_item);
323 found_type = btrfs_file_extent_type(leaf, extent);
324 if (found_type == BTRFS_FILE_EXTENT_REG) {
325 extent_end = found_key.offset +
326 btrfs_file_extent_num_bytes(leaf, extent);
327 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
328 struct btrfs_item *item;
329 item = btrfs_item_nr(leaf, slot);
330 extent_end = found_key.offset +
331 btrfs_file_extent_inline_len(leaf, extent);
/* round the end of an inline extent up to a sector boundary */
332 extent_end = (extent_end + root->sectorsize - 1) &
333 ~((u64)root->sectorsize - 1);
334 }
335 last_offset = extent_end;
336 path->slots[0]++;
337 }
/* the leading '0 &&' keeps this size check switched off as well */
338 if (0 && last_offset < inode->i_size) {
339 WARN_ON(1);
340 btrfs_print_leaf(root, leaf);
341 printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
342 inode->i_ino, (unsigned long long)last_offset,
343 (unsigned long long)inode->i_size);
344 err = 1;
345
346 }
347out:
348 btrfs_free_path(path);
349 return err;
350#endif
351}
352
353/*
354 * this is very complex, but the basic idea is to drop all extents
355 * in the range start - end. hint_byte is filled in with a byte number
356 * that would be a good hint to the block allocator for this file.
357 *
358 * If an extent intersects the range but is not entirely inside the range
359 * it is either truncated or split. Anything entirely inside the range
360 * is deleted from the tree.
361 *
362 * inline_limit is used to tell this code which offsets in the file to keep
363 * if they contain inline extents.
364 */
/*
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the
 * negative result of btrfs_lookup_file_extent().  Failures of the item
 * and reference helpers trip BUG_ON().
 *
 * 'end' is exclusive here: the extent cache is dropped for
 * [start, end - 1] and items whose key offset is >= end are left alone.
 *
 * NOTE(review): inline_limit is overwritten with 0 on entry, so the
 * 'key.offset < inline_limit' branch further down can never be taken and
 * the caller's value has no effect.
 */
365noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
366 struct btrfs_root *root, struct inode *inode,
367 u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
368{
369 u64 extent_end = 0;
370 u64 locked_end = end;
371 u64 search_start = start;
372 u64 leaf_start;
373 u64 ram_bytes = 0;
374 u64 orig_parent = 0;
375 u64 disk_bytenr = 0;
376 u8 compression;
377 u8 encryption;
378 u16 other_encoding = 0;
379 u64 root_gen;
380 u64 root_owner;
381 struct extent_buffer *leaf;
382 struct btrfs_file_extent_item *extent;
383 struct btrfs_path *path;
384 struct btrfs_key key;
385 struct btrfs_file_extent_item old;
386 int keep;
387 int slot;
388 int bookend;
389 int found_type = 0;
390 int found_extent;
391 int found_inline;
392 int recow;
393 int ret;
394
395 inline_limit = 0;
396 btrfs_drop_extent_cache(inode, start, end - 1, 0);
397
398 path = btrfs_alloc_path();
399 if (!path)
400 return -ENOMEM;
401 while (1) {
402 recow = 0;
403 btrfs_release_path(root, path);
404 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
405 search_start, -1);
406 if (ret < 0)
407 goto out;
408 if (ret > 0) {
/* no exact match: look at the item just before the insert position */
409 if (path->slots[0] == 0) {
410 ret = 0;
411 goto out;
412 }
413 path->slots[0]--;
414 }
415next_slot:
/* reset the per-item state before looking at the current slot */
416 keep = 0;
417 bookend = 0;
418 found_extent = 0;
419 found_inline = 0;
420 leaf_start = 0;
421 root_gen = 0;
422 root_owner = 0;
423 compression = 0;
424 encryption = 0;
425 extent = NULL;
426 leaf = path->nodes[0];
427 slot = path->slots[0];
428 ret = 0;
429 btrfs_item_key_to_cpu(leaf, &key, slot);
430 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
431 key.offset >= end) {
432 goto out;
433 }
434 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
435 key.objectid != inode->i_ino) {
436 goto out;
437 }
/*
 * recow is set after btrfs_next_leaf() moved the path to a new leaf;
 * in that case the search is redone from the top of the loop.
 */
438 if (recow) {
439 search_start = max(key.offset, start);
440 continue;
441 }
442 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
443 extent = btrfs_item_ptr(leaf, slot,
444 struct btrfs_file_extent_item);
445 found_type = btrfs_file_extent_type(leaf, extent);
446 compression = btrfs_file_extent_compression(leaf,
447 extent);
448 encryption = btrfs_file_extent_encryption(leaf,
449 extent);
450 other_encoding = btrfs_file_extent_other_encoding(leaf,
451 extent);
452 if (found_type == BTRFS_FILE_EXTENT_REG ||
453 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* extent_end briefly holds the disk bytenr, used for the hint */
454 extent_end =
455 btrfs_file_extent_disk_bytenr(leaf,
456 extent);
457 if (extent_end)
458 *hint_byte = extent_end;
459
460 extent_end = key.offset +
461 btrfs_file_extent_num_bytes(leaf, extent);
462 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
463 extent);
464 found_extent = 1;
465 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
466 found_inline = 1;
467 extent_end = key.offset +
468 btrfs_file_extent_inline_len(leaf, extent);
469 }
470 } else {
471 extent_end = search_start;
472 }
473
474 /* we found nothing we can drop */
475 if ((!found_extent && !found_inline) ||
476 search_start >= extent_end) {
477 int nextret;
478 u32 nritems;
479 nritems = btrfs_header_nritems(leaf);
480 if (slot >= nritems - 1) {
481 nextret = btrfs_next_leaf(root, path);
482 if (nextret)
483 goto out;
484 recow = 1;
485 } else {
486 path->slots[0]++;
487 }
488 goto next_slot;
489 }
490
491 if (end <= extent_end && start >= key.offset && found_inline)
492 *hint_byte = EXTENT_MAP_INLINE;
493
494 if (found_extent) {
/*
 * Snapshot the extent item and the leaf's identity now; the path is
 * released and the item may be deleted before they are needed.
 */
495 read_extent_buffer(leaf, &old, (unsigned long)extent,
496 sizeof(old));
497 root_gen = btrfs_header_generation(leaf);
498 root_owner = btrfs_header_owner(leaf);
499 leaf_start = leaf->start;
500 }
501
/* the extent runs past 'end': its tail has to survive as a bookend */
502 if (end < extent_end && end >= key.offset) {
503 bookend = 1;
504 if (found_inline && start <= key.offset)
505 keep = 1;
506 }
507
508 if (bookend && found_extent) {
/*
 * Extend the io_tree lock to the end of the extent.  If the lock
 * cannot be taken without waiting, drop the path, take it the
 * blocking way and restart the lookup.
 */
509 if (locked_end < extent_end) {
510 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
511 locked_end, extent_end - 1,
512 GFP_NOFS);
513 if (!ret) {
514 btrfs_release_path(root, path);
515 lock_extent(&BTRFS_I(inode)->io_tree,
516 locked_end, extent_end - 1,
517 GFP_NOFS);
518 locked_end = extent_end;
519 continue;
520 }
521 locked_end = extent_end;
522 }
523 orig_parent = path->nodes[0]->start;
524 disk_bytenr = le64_to_cpu(old.disk_bytenr);
/* the bookend item will reference the same disk extent: add a ref */
525 if (disk_bytenr != 0) {
526 ret = btrfs_inc_extent_ref(trans, root,
527 disk_bytenr,
528 le64_to_cpu(old.disk_num_bytes),
529 orig_parent, root->root_key.objectid,
530 trans->transid, inode->i_ino);
531 BUG_ON(ret);
532 }
533 }
534
/* the next lookup resumes after this extent (sector aligned for inline) */
535 if (found_inline) {
536 u64 mask = root->sectorsize - 1;
537 search_start = (extent_end + mask) & ~mask;
538 } else
539 search_start = extent_end;
540
541 /* truncate existing extent */
542 if (start > key.offset) {
543 u64 new_num;
544 u64 old_num;
545 keep = 1;
546 WARN_ON(start & (root->sectorsize - 1));
547 if (found_extent) {
548 new_num = start - key.offset;
549 old_num = btrfs_file_extent_num_bytes(leaf,
550 extent);
551 *hint_byte =
552 btrfs_file_extent_disk_bytenr(leaf,
553 extent);
554 if (btrfs_file_extent_disk_bytenr(leaf,
555 extent)) {
556 inode_sub_bytes(inode, old_num -
557 new_num);
558 }
559 btrfs_set_file_extent_num_bytes(leaf,
560 extent, new_num);
561 btrfs_mark_buffer_dirty(leaf);
562 } else if (key.offset < inline_limit &&
563 (end > extent_end) &&
564 (inline_limit < extent_end)) {
565 u32 new_size;
566 new_size = btrfs_file_extent_calc_inline_size(
567 inline_limit - key.offset);
568 inode_sub_bytes(inode, extent_end -
569 inline_limit);
570 btrfs_set_file_extent_ram_bytes(leaf, extent,
571 new_size);
572 if (!compression && !encryption) {
573 btrfs_truncate_item(trans, root, path,
574 new_size, 1);
575 }
576 }
577 }
578 /* delete the entire extent */
579 if (!keep) {
580 if (found_inline)
581 inode_sub_bytes(inode, extent_end -
582 key.offset);
583 ret = btrfs_del_item(trans, root, path);
584 /* TODO update progress marker and return */
585 BUG_ON(ret);
586 extent = NULL;
587 btrfs_release_path(root, path);
588 /* the extent will be freed later */
589 }
/* inline extent sticking out past 'end': cut off its front part */
590 if (bookend && found_inline && start <= key.offset) {
591 u32 new_size;
592 new_size = btrfs_file_extent_calc_inline_size(
593 extent_end - end);
594 inode_sub_bytes(inode, end - key.offset);
595 btrfs_set_file_extent_ram_bytes(leaf, extent,
596 new_size);
597 if (!compression && !encryption)
598 ret = btrfs_truncate_item(trans, root, path,
599 new_size, 0);
600 BUG_ON(ret);
601 }
602 /* create bookend, splitting the extent in two */
603 if (bookend && found_extent) {
604 struct btrfs_key ins;
605 ins.objectid = inode->i_ino;
606 ins.offset = end;
607 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
608
609 btrfs_release_path(root, path);
610 ret = btrfs_insert_empty_item(trans, root, path, &ins,
611 sizeof(*extent));
612 BUG_ON(ret);
613
/* start from a copy of the old item, then fix up the fields that differ */
614 leaf = path->nodes[0];
615 extent = btrfs_item_ptr(leaf, path->slots[0],
616 struct btrfs_file_extent_item);
617 write_extent_buffer(leaf, &old,
618 (unsigned long)extent, sizeof(old));
619
620 btrfs_set_file_extent_compression(leaf, extent,
621 compression);
622 btrfs_set_file_extent_encryption(leaf, extent,
623 encryption);
624 btrfs_set_file_extent_other_encoding(leaf, extent,
625 other_encoding);
626 btrfs_set_file_extent_offset(leaf, extent,
627 le64_to_cpu(old.offset) + end - key.offset);
628 WARN_ON(le64_to_cpu(old.num_bytes) <
629 (extent_end - end));
630 btrfs_set_file_extent_num_bytes(leaf, extent,
631 extent_end - end);
632
633 /*
634 * set the ram bytes to the size of the full extent
635 * before splitting. This is a worst case flag,
636 * but its the best we can do because we don't know
637 * how splitting affects compression
638 */
639 btrfs_set_file_extent_ram_bytes(leaf, extent,
640 ram_bytes);
641 btrfs_set_file_extent_type(leaf, extent, found_type);
642
643 btrfs_mark_buffer_dirty(path->nodes[0]);
644
/* re-point the reference taken earlier from the old leaf to this leaf */
645 if (disk_bytenr != 0) {
646 ret = btrfs_update_extent_ref(trans, root,
647 disk_bytenr, orig_parent,
648 leaf->start,
649 root->root_key.objectid,
650 trans->transid, ins.objectid);
651
652 BUG_ON(ret);
653 }
654 btrfs_release_path(root, path);
655 if (disk_bytenr != 0)
656 inode_add_bytes(inode, extent_end - end);
657 }
658
/* the item was deleted above: drop its reference on the disk extent */
659 if (found_extent && !keep) {
660 u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
661
662 if (old_disk_bytenr != 0) {
663 inode_sub_bytes(inode,
664 le64_to_cpu(old.num_bytes));
665 ret = btrfs_free_extent(trans, root,
666 old_disk_bytenr,
667 le64_to_cpu(old.disk_num_bytes),
668 leaf_start, root_owner,
669 root_gen, key.objectid, 0);
670 BUG_ON(ret);
671 *hint_byte = old_disk_bytenr;
672 }
673 }
674
675 if (search_start >= end) {
676 ret = 0;
677 goto out;
678 }
679 }
680out:
681 btrfs_free_path(path);
/* release only the extra io_tree range this function locked beyond 'end' */
682 if (locked_end > end) {
683 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
684 GFP_NOFS);
685 }
686 btrfs_check_file(root, inode);
687 return ret;
688}
689
690static int extent_mergeable(struct extent_buffer *leaf, int slot,
691 u64 objectid, u64 bytenr, u64 *start, u64 *end)
692{
693 struct btrfs_file_extent_item *fi;
694 struct btrfs_key key;
695 u64 extent_end;
696
697 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
698 return 0;
699
700 btrfs_item_key_to_cpu(leaf, &key, slot);
701 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
702 return 0;
703
704 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
705 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
706 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
707 btrfs_file_extent_compression(leaf, fi) ||
708 btrfs_file_extent_encryption(leaf, fi) ||
709 btrfs_file_extent_other_encoding(leaf, fi))
710 return 0;
711
712 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
713 if ((*start && *start != key.offset) || (*end && *end != extent_end))
714 return 0;
715
716 *start = key.offset;
717 *end = extent_end;
718 return 1;
719}
720
721/*
722 * Mark extent in the range start - end as written.
723 *
724 * This changes extent type from 'pre-allocated' to 'regular'. If only
725 * part of extent is marked as written, the extent will be split into
726 * two or three.
727 */
728int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
729 struct btrfs_root *root,
730 struct inode *inode, u64 start, u64 end)
731{
732 struct extent_buffer *leaf;
733 struct btrfs_path *path;
734 struct btrfs_file_extent_item *fi;
735 struct btrfs_key key;
736 u64 bytenr;
737 u64 num_bytes;
738 u64 extent_end;
739 u64 extent_offset;
740 u64 other_start;
741 u64 other_end;
742 u64 split = start;
743 u64 locked_end = end;
744 u64 orig_parent;
745 int extent_type;
746 int split_end = 1;
747 int ret;
748
749 btrfs_drop_extent_cache(inode, start, end - 1, 0);
750
751 path = btrfs_alloc_path();
752 BUG_ON(!path);
753again:
754 key.objectid = inode->i_ino;
755 key.type = BTRFS_EXTENT_DATA_KEY;
756 if (split == start)
757 key.offset = split;
758 else
759 key.offset = split - 1;
760
761 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
762 if (ret > 0 && path->slots[0] > 0)
763 path->slots[0]--;
764
765 leaf = path->nodes[0];
766 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
767 BUG_ON(key.objectid != inode->i_ino ||
768 key.type != BTRFS_EXTENT_DATA_KEY);
769 fi = btrfs_item_ptr(leaf, path->slots[0],
770 struct btrfs_file_extent_item);
771 extent_type = btrfs_file_extent_type(leaf, fi);
772 BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
773 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
774 BUG_ON(key.offset > start || extent_end < end);
775
776 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
777 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
778 extent_offset = btrfs_file_extent_offset(leaf, fi);
779
780 if (key.offset == start)
781 split = end;
782
783 if (key.offset == start && extent_end == end) {
784 int del_nr = 0;
785 int del_slot = 0;
786 u64 leaf_owner = btrfs_header_owner(leaf);
787 u64 leaf_gen = btrfs_header_generation(leaf);
788 other_start = end;
789 other_end = 0;
790 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
791 bytenr, &other_start, &other_end)) {
792 extent_end = other_end;
793 del_slot = path->slots[0] + 1;
794 del_nr++;
795 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
796 leaf->start, leaf_owner,
797 leaf_gen, inode->i_ino, 0);
798 BUG_ON(ret);
799 }
800 other_start = 0;
801 other_end = start;
802 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
803 bytenr, &other_start, &other_end)) {
804 key.offset = other_start;
805 del_slot = path->slots[0];
806 del_nr++;
807 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
808 leaf->start, leaf_owner,
809 leaf_gen, inode->i_ino, 0);
810 BUG_ON(ret);
811 }
812 split_end = 0;
813 if (del_nr == 0) {
814 btrfs_set_file_extent_type(leaf, fi,
815 BTRFS_FILE_EXTENT_REG);
816 goto done;
817 }
818
819 fi = btrfs_item_ptr(leaf, del_slot - 1,
820 struct btrfs_file_extent_item);
821 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
822 btrfs_set_file_extent_num_bytes(leaf, fi,
823 extent_end - key.offset);
824 btrfs_mark_buffer_dirty(leaf);
825
826 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
827 BUG_ON(ret);
828 goto done;
829 } else if (split == start) {
830 if (locked_end < extent_end) {
831 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
832 locked_end, extent_end - 1, GFP_NOFS);
833 if (!ret) {
834 btrfs_release_path(root, path);
835 lock_extent(&BTRFS_I(inode)->io_tree,
836 locked_end, extent_end - 1, GFP_NOFS);
837 locked_end = extent_end;
838 goto again;
839 }
840 locked_end = extent_end;
841 }
842 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
843 extent_offset += split - key.offset;
844 } else {
845 BUG_ON(key.offset != start);
846 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
847 split - key.offset);
848 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
849 key.offset = split;
850 btrfs_set_item_key_safe(trans, root, path, &key);
851 extent_end = split;
852 }
853
854 if (extent_end == end) {
855 split_end = 0;
856 extent_type = BTRFS_FILE_EXTENT_REG;
857 }
858 if (extent_end == end && split == start) {
859 other_start = end;
860 other_end = 0;
861 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
862 bytenr, &other_start, &other_end)) {
863 path->slots[0]++;
864 fi = btrfs_item_ptr(leaf, path->slots[0],
865 struct btrfs_file_extent_item);
866 key.offset = split;
867 btrfs_set_item_key_safe(trans, root, path, &key);
868 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
869 btrfs_set_file_extent_num_bytes(leaf, fi,
870 other_end - split);
871 goto done;
872 }
873 }
874 if (extent_end == end && split == end) {
875 other_start = 0;
876 other_end = start;
877 if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
878 bytenr, &other_start, &other_end)) {
879 path->slots[0]--;
880 fi = btrfs_item_ptr(leaf, path->slots[0],
881 struct btrfs_file_extent_item);
882 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
883 other_start);
884 goto done;
885 }
886 }
887
888 btrfs_mark_buffer_dirty(leaf);
889
890 orig_parent = leaf->start;
891 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
892 orig_parent, root->root_key.objectid,
893 trans->transid, inode->i_ino);
894 BUG_ON(ret);
895 btrfs_release_path(root, path);
896
897 key.offset = start;
898 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
899 BUG_ON(ret);
900
901 leaf = path->nodes[0];
902 fi = btrfs_item_ptr(leaf, path->slots[0],
903 struct btrfs_file_extent_item);
904 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
905 btrfs_set_file_extent_type(leaf, fi, extent_type);
906 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
907 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
908 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
909 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
910 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
911 btrfs_set_file_extent_compression(leaf, fi, 0);
912 btrfs_set_file_extent_encryption(leaf, fi, 0);
913 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
914
915 if (orig_parent != leaf->start) {
916 ret = btrfs_update_extent_ref(trans, root, bytenr,
917 orig_parent, leaf->start,
918 root->root_key.objectid,
919 trans->transid, inode->i_ino);
920 BUG_ON(ret);
921 }
922done:
923 btrfs_mark_buffer_dirty(leaf);
924 btrfs_release_path(root, path);
925 if (split_end && split == start) {
926 split = end;
927 goto again;
928 }
929 if (locked_end > end) {
930 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
931 GFP_NOFS);
932 }
933 btrfs_free_path(path);
934 return 0;
935}
936
937/*
938 * this gets pages into the page cache and locks them down, it also properly
939 * waits for data=ordered extents to finish before allowing the pages to be
940 * modified.
941 */
942static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
943 struct page **pages, size_t num_pages,
944 loff_t pos, unsigned long first_index,
945 unsigned long last_index, size_t write_bytes)
946{
947 int i;
948 unsigned long index = pos >> PAGE_CACHE_SHIFT;
949 struct inode *inode = fdentry(file)->d_inode;
950 int err = 0;
951 u64 start_pos;
952 u64 last_pos;
953
954 start_pos = pos & ~((u64)root->sectorsize - 1);
955 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
956
957 if (start_pos > inode->i_size) {
958 err = btrfs_cont_expand(inode, start_pos);
959 if (err)
960 return err;
961 }
962
963 memset(pages, 0, num_pages * sizeof(struct page *));
964again:
965 for (i = 0; i < num_pages; i++) {
966 pages[i] = grab_cache_page(inode->i_mapping, index + i);
967 if (!pages[i]) {
968 err = -ENOMEM;
969 BUG_ON(1);
970 }
971 wait_on_page_writeback(pages[i]);
972 }
973 if (start_pos < inode->i_size) {
974 struct btrfs_ordered_extent *ordered;
975 lock_extent(&BTRFS_I(inode)->io_tree,
976 start_pos, last_pos - 1, GFP_NOFS);
977 ordered = btrfs_lookup_first_ordered_extent(inode,
978 last_pos - 1);
979 if (ordered &&
980 ordered->file_offset + ordered->len > start_pos &&
981 ordered->file_offset < last_pos) {
982 btrfs_put_ordered_extent(ordered);
983 unlock_extent(&BTRFS_I(inode)->io_tree,
984 start_pos, last_pos - 1, GFP_NOFS);
985 for (i = 0; i < num_pages; i++) {
986 unlock_page(pages[i]);
987 page_cache_release(pages[i]);
988 }
989 btrfs_wait_ordered_range(inode, start_pos,
990 last_pos - start_pos);
991 goto again;
992 }
993 if (ordered)
994 btrfs_put_ordered_extent(ordered);
995
996 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
997 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
998 GFP_NOFS);
999 unlock_extent(&BTRFS_I(inode)->io_tree,
1000 start_pos, last_pos - 1, GFP_NOFS);
1001 }
1002 for (i = 0; i < num_pages; i++) {
1003 clear_page_dirty_for_io(pages[i]);
1004 set_page_extent_mapped(pages[i]);
1005 WARN_ON(!PageLocked(pages[i]));
1006 }
1007 return 0;
1008}
1009
1010static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1011 size_t count, loff_t *ppos)
1012{
1013 loff_t pos;
1014 loff_t start_pos;
1015 ssize_t num_written = 0;
1016 ssize_t err = 0;
1017 int ret = 0;
1018 struct inode *inode = fdentry(file)->d_inode;
1019 struct btrfs_root *root = BTRFS_I(inode)->root;
1020 struct page **pages = NULL;
1021 int nrptrs;
1022 struct page *pinned[2];
1023 unsigned long first_index;
1024 unsigned long last_index;
1025 int will_write;
1026
1027 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
1028 (file->f_flags & O_DIRECT));
1029
1030 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
1031 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1032 pinned[0] = NULL;
1033 pinned[1] = NULL;
1034
1035 pos = *ppos;
1036 start_pos = pos;
1037
1038 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1039 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1040 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1041 if (err)
1042 goto out_nolock;
1043 if (count == 0)
1044 goto out_nolock;
1045
1046 err = file_remove_suid(file);
1047 if (err)
1048 goto out_nolock;
1049 file_update_time(file);
1050
1051 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1052
1053 mutex_lock(&inode->i_mutex);
1054 BTRFS_I(inode)->sequence++;
1055 first_index = pos >> PAGE_CACHE_SHIFT;
1056 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1057
1058 /*
1059 * there are lots of better ways to do this, but this code
1060 * makes sure the first and last page in the file range are
1061 * up to date and ready for cow
1062 */
1063 if ((pos & (PAGE_CACHE_SIZE - 1))) {
1064 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1065 if (!PageUptodate(pinned[0])) {
1066 ret = btrfs_readpage(NULL, pinned[0]);
1067 BUG_ON(ret);
1068 wait_on_page_locked(pinned[0]);
1069 } else {
1070 unlock_page(pinned[0]);
1071 }
1072 }
1073 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
1074 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1075 if (!PageUptodate(pinned[1])) {
1076 ret = btrfs_readpage(NULL, pinned[1]);
1077 BUG_ON(ret);
1078 wait_on_page_locked(pinned[1]);
1079 } else {
1080 unlock_page(pinned[1]);
1081 }
1082 }
1083
1084 while (count > 0) {
1085 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1086 size_t write_bytes = min(count, nrptrs *
1087 (size_t)PAGE_CACHE_SIZE -
1088 offset);
1089 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1090 PAGE_CACHE_SHIFT;
1091
1092 WARN_ON(num_pages > nrptrs);
1093 memset(pages, 0, sizeof(struct page *) * nrptrs);
1094
1095 ret = btrfs_check_free_space(root, write_bytes, 0);
1096 if (ret)
1097 goto out;
1098
1099 ret = prepare_pages(root, file, pages, num_pages,
1100 pos, first_index, last_index,
1101 write_bytes);
1102 if (ret)
1103 goto out;
1104
1105 ret = btrfs_copy_from_user(pos, num_pages,
1106 write_bytes, pages, buf);
1107 if (ret) {
1108 btrfs_drop_pages(pages, num_pages);
1109 goto out;
1110 }
1111
1112 ret = dirty_and_release_pages(NULL, root, file, pages,
1113 num_pages, pos, write_bytes);
1114 btrfs_drop_pages(pages, num_pages);
1115 if (ret)
1116 goto out;
1117
1118 if (will_write) {
1119 btrfs_fdatawrite_range(inode->i_mapping, pos,
1120 pos + write_bytes - 1,
1121 WB_SYNC_NONE);
1122 } else {
1123 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1124 num_pages);
1125 if (num_pages <
1126 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1127 btrfs_btree_balance_dirty(root, 1);
1128 btrfs_throttle(root);
1129 }
1130
1131 buf += write_bytes;
1132 count -= write_bytes;
1133 pos += write_bytes;
1134 num_written += write_bytes;
1135
1136 cond_resched();
1137 }
1138out:
1139 mutex_unlock(&inode->i_mutex);
1140
1141out_nolock:
1142 kfree(pages);
1143 if (pinned[0])
1144 page_cache_release(pinned[0]);
1145 if (pinned[1])
1146 page_cache_release(pinned[1]);
1147 *ppos = pos;
1148
1149 if (num_written > 0 && will_write) {
1150 struct btrfs_trans_handle *trans;
1151
1152 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1153 if (err)
1154 num_written = err;
1155
1156 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1157 trans = btrfs_start_transaction(root, 1);
1158 ret = btrfs_log_dentry_safe(trans, root,
1159 file->f_dentry);
1160 if (ret == 0) {
1161 btrfs_sync_log(trans, root);
1162 btrfs_end_transaction(trans, root);
1163 } else {
1164 btrfs_commit_transaction(trans, root);
1165 }
1166 }
1167 if (file->f_flags & O_DIRECT) {
1168 invalidate_mapping_pages(inode->i_mapping,
1169 start_pos >> PAGE_CACHE_SHIFT,
1170 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1171 }
1172 }
1173 current->backing_dev_info = NULL;
1174 return num_written ? num_written : err;
1175}
1176
1177int btrfs_release_file(struct inode *inode, struct file *filp)
1178{
1179 if (filp->private_data)
1180 btrfs_ioctl_trans_end(filp);
1181 return 0;
1182}
1183
1184/*
1185 * fsync call for both files and directories. This logs the inode into
1186 * the tree log instead of forcing full commits whenever possible.
1187 *
1188 * It needs to call filemap_fdatawait so that all ordered extent updates are
1189 * in the metadata btree are up to date for copying to the log.
1190 *
1191 * It drops the inode mutex before doing the tree log commit. This is an
1192 * important optimization for directories because holding the mutex prevents
1193 * new operations on the dir while we write to disk.
1194 */
1195int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1196{
1197 struct inode *inode = dentry->d_inode;
1198 struct btrfs_root *root = BTRFS_I(inode)->root;
1199 int ret = 0;
1200 struct btrfs_trans_handle *trans;
1201
1202 /*
1203 * check the transaction that last modified this inode
1204 * and see if its already been committed
1205 */
1206 if (!BTRFS_I(inode)->last_trans)
1207 goto out;
1208
1209 mutex_lock(&root->fs_info->trans_mutex);
1210 if (BTRFS_I(inode)->last_trans <=
1211 root->fs_info->last_trans_committed) {
1212 BTRFS_I(inode)->last_trans = 0;
1213 mutex_unlock(&root->fs_info->trans_mutex);
1214 goto out;
1215 }
1216 mutex_unlock(&root->fs_info->trans_mutex);
1217
1218 root->fs_info->tree_log_batch++;
1219 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++;
1222
1223 /*
1224 * ok we haven't committed the transaction yet, lets do a commit
1225 */
1226 if (file->private_data)
1227 btrfs_ioctl_trans_end(file);
1228
1229 trans = btrfs_start_transaction(root, 1);
1230 if (!trans) {
1231 ret = -ENOMEM;
1232 goto out;
1233 }
1234
1235 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1236 if (ret < 0)
1237 goto out;
1238
1239 /* we've logged all the items and now have a consistent
1240 * version of the file in the log. It is possible that
1241 * someone will come in and modify the file, but that's
1242 * fine because the log is consistent on disk, and we
1243 * have references to all of the file's extents
1244 *
1245 * It is possible that someone will come in and log the
1246 * file again, but that will end up using the synchronization
1247 * inside btrfs_sync_log to keep things safe.
1248 */
1249 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1250
1251 if (ret > 0) {
1252 ret = btrfs_commit_transaction(trans, root);
1253 } else {
1254 btrfs_sync_log(trans, root);
1255 ret = btrfs_end_transaction(trans, root);
1256 }
1257 mutex_lock(&file->f_dentry->d_inode->i_mutex);
1258out:
1259 return ret > 0 ? EIO : ret;
1260}
1261
1262static struct vm_operations_struct btrfs_file_vm_ops = {
1263 .fault = filemap_fault,
1264 .page_mkwrite = btrfs_page_mkwrite,
1265};
1266
1267static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1268{
1269 vma->vm_ops = &btrfs_file_vm_ops;
1270 file_accessed(filp);
1271 return 0;
1272}
1273
1274struct file_operations btrfs_file_operations = {
1275 .llseek = generic_file_llseek,
1276 .read = do_sync_read,
1277 .aio_read = generic_file_aio_read,
1278 .splice_read = generic_file_splice_read,
1279 .write = btrfs_file_write,
1280 .mmap = btrfs_file_mmap,
1281 .open = generic_file_open,
1282 .release = btrfs_release_file,
1283 .fsync = btrfs_sync_file,
1284 .unlocked_ioctl = btrfs_ioctl,
1285#ifdef CONFIG_COMPAT
1286 .compat_ioctl = btrfs_ioctl,
1287#endif
1288};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..d1e5f0e84c58
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,495 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
70/*
71 * searches the tree for the given offset. If contains is set we will return
72 * the free space that contains the given offset. If contains is not set we
73 * will return the free space that starts at or after the given offset and is
74 * at least bytes long.
75 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes,
78 int contains)
79{
80 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL;
82
83 while (n) {
84 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85
86 if (offset < entry->offset) {
87 if (!contains &&
88 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes))
90 ret = entry;
91 n = n->rb_left;
92 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) {
95 ret = entry;
96 break;
97 }
98 n = n->rb_right;
99 } else {
100 if (bytes > entry->bytes) {
101 n = n->rb_right;
102 continue;
103 }
104 ret = entry;
105 break;
106 }
107 }
108
109 return ret;
110}
111
112/*
113 * return a chunk at least bytes size, as close to offset that we can get.
114 */
115static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
116 u64 offset, u64 bytes)
117{
118 struct rb_node *n = root->rb_node;
119 struct btrfs_free_space *entry, *ret = NULL;
120
121 while (n) {
122 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
123
124 if (bytes < entry->bytes) {
125 /*
126 * We prefer to get a hole size as close to the size we
127 * are asking for so we don't take small slivers out of
128 * huge holes, but we also want to get as close to the
129 * offset as possible so we don't have a whole lot of
130 * fragmentation.
131 */
132 if (offset <= entry->offset) {
133 if (!ret)
134 ret = entry;
135 else if (entry->bytes < ret->bytes)
136 ret = entry;
137 else if (entry->offset < ret->offset)
138 ret = entry;
139 }
140 n = n->rb_left;
141 } else if (bytes > entry->bytes) {
142 n = n->rb_right;
143 } else {
144 /*
145 * Ok we may have multiple chunks of the wanted size,
146 * so we don't want to take the first one we find, we
147 * want to take the one closest to our given offset, so
148 * keep searching just in case theres a better match.
149 */
150 n = n->rb_right;
151 if (offset > entry->offset)
152 continue;
153 else if (!ret || entry->offset < ret->offset)
154 ret = entry;
155 }
156 }
157
158 return ret;
159}
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
173
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes)
189{
190 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0;
195
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info)
198 return -ENOMEM;
199
200 /*
201 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range
204 */
205 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1);
207 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1);
209
210 if (right_info && right_info->offset == offset+bytes) {
211 unlink_free_space(block_group, right_info);
212 info = right_info;
213 info->offset = offset;
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 }
225
226 if (left_info) {
227 unlink_free_space(block_group, left_info);
228
229 if (unlikely((left_info->offset + left_info->bytes) !=
230 offset)) {
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 }
251
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info);
265 if (ret)
266 kfree(info);
267out:
268 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST)
271 BUG();
272 }
273
274 kfree(alloc_info);
275
276 return ret;
277}
278
279static int
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
281 u64 offset, u64 bytes)
282{
283 struct btrfs_free_space *info;
284 int ret = 0;
285
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1);
288
289 if (info && info->offset == offset) {
290 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu,"
292 "trying to use %llu\n",
293 (unsigned long long)info->offset,
294 (unsigned long long)info->bytes,
295 (unsigned long long)bytes);
296 WARN_ON(1);
297 ret = -EINVAL;
298 goto out;
299 }
300 unlink_free_space(block_group, info);
301
302 if (info->bytes == bytes) {
303 kfree(info);
304 goto out;
305 }
306
307 info->offset += bytes;
308 info->bytes -= bytes;
309
310 ret = link_free_space(block_group, info);
311 BUG_ON(ret);
312 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) {
314 u64 old_start = info->offset;
315 /*
316 * we're freeing space in the middle of the info,
317 * this can happen during tree log replay
318 *
319 * first unlink the old info and then
320 * insert it again after the hole we're creating
321 */
322 unlink_free_space(block_group, info);
323 if (offset + bytes < info->offset + info->bytes) {
324 u64 old_end = info->offset + info->bytes;
325
326 info->offset = offset + bytes;
327 info->bytes = old_end - info->offset;
328 ret = link_free_space(block_group, info);
329 BUG_ON(ret);
330 } else {
331 /* the hole we're creating ends at the end
332 * of the info struct, just free the info
333 */
334 kfree(info);
335 }
336
337 /* step two, insert a new info struct to cover anything
338 * before the hole
339 */
340 ret = __btrfs_add_free_space(block_group, old_start,
341 offset - old_start);
342 BUG_ON(ret);
343 } else {
344 WARN_ON(1);
345 }
346out:
347 return ret;
348}
349
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes)
402{
403 struct btrfs_free_space *info;
404 struct rb_node *n;
405 int count = 0;
406
407 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
408 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes)
410 count++;
411 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count);
414}
415
416u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
417{
418 struct btrfs_free_space *info;
419 struct rb_node *n;
420 u64 ret = 0;
421
422 for (n = rb_first(&block_group->free_space_offset); n;
423 n = rb_next(n)) {
424 info = rb_entry(n, struct btrfs_free_space, offset_index);
425 ret += info->bytes;
426 }
427
428 return ret;
429}
430
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{
433 struct btrfs_free_space *info;
434 struct rb_node *node;
435
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info);
440 kfree(info);
441 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex);
443 cond_resched();
444 mutex_lock(&block_group->alloc_mutex);
445 }
446 }
447 mutex_unlock(&block_group->alloc_mutex);
448}
449
#if 0
/* compiled out: locked wrapper around tree_search_offset() */
static struct btrfs_free_space *
btrfs_find_free_space_offset(struct btrfs_block_group_cache *block_group,
			     u64 offset, u64 bytes)
{
	struct btrfs_free_space *found;

	mutex_lock(&block_group->alloc_mutex);
	found = tree_search_offset(&block_group->free_space_offset, offset,
				   bytes, 0);
	mutex_unlock(&block_group->alloc_mutex);

	return found;
}

/* compiled out: locked wrapper around tree_search_bytes() */
static struct btrfs_free_space *
btrfs_find_free_space_bytes(struct btrfs_block_group_cache *block_group,
			    u64 offset, u64 bytes)
{
	struct btrfs_free_space *found;

	mutex_lock(&block_group->alloc_mutex);
	found = tree_search_bytes(&block_group->free_space_bytes, offset,
				  bytes);
	mutex_unlock(&block_group->alloc_mutex);

	return found;
}
#endif
481
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
483 *block_group, u64 offset,
484 u64 bytes)
485{
486 struct btrfs_free_space *ret = NULL;
487
488 ret = tree_search_offset(&block_group->free_space_offset, offset,
489 bytes, 0);
490 if (!ret)
491 ret = tree_search_bytes(&block_group->free_space_bytes,
492 offset, bytes);
493
494 return ret;
495}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __HASH__
20#define __HASH__
21
22#include "crc32c.h"
23static inline u64 btrfs_name_hash(const char *name, int len)
24{
25 return btrfs_crc32c((u32)~1, name, len);
26}
27#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..3d46fa1f29a4
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23static int find_name_in_backref(struct btrfs_path *path, const char *name,
24 int name_len, struct btrfs_inode_ref **ref_ret)
25{
26 struct extent_buffer *leaf;
27 struct btrfs_inode_ref *ref;
28 unsigned long ptr;
29 unsigned long name_ptr;
30 u32 item_size;
31 u32 cur_offset = 0;
32 int len;
33
34 leaf = path->nodes[0];
35 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
36 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
37 while (cur_offset < item_size) {
38 ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
39 len = btrfs_inode_ref_name_len(leaf, ref);
40 name_ptr = (unsigned long)(ref + 1);
41 cur_offset += len + sizeof(*ref);
42 if (len != name_len)
43 continue;
44 if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
45 *ref_ret = ref;
46 return 1;
47 }
48 }
49 return 0;
50}
51
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root,
54 const char *name, int name_len,
55 u64 inode_objectid, u64 ref_objectid, u64 *index)
56{
57 struct btrfs_path *path;
58 struct btrfs_key key;
59 struct btrfs_inode_ref *ref;
60 struct extent_buffer *leaf;
61 unsigned long ptr;
62 unsigned long item_start;
63 u32 item_size;
64 u32 sub_item_len;
65 int ret;
66 int del_len = name_len + sizeof(*ref);
67
68 key.objectid = inode_objectid;
69 key.offset = ref_objectid;
70 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
71
72 path = btrfs_alloc_path();
73 if (!path)
74 return -ENOMEM;
75
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) {
78 ret = -ENOENT;
79 goto out;
80 } else if (ret < 0) {
81 goto out;
82 }
83 if (!find_name_in_backref(path, name, name_len, &ref)) {
84 ret = -ENOENT;
85 goto out;
86 }
87 leaf = path->nodes[0];
88 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
89
90 if (index)
91 *index = btrfs_inode_ref_index(leaf, ref);
92
93 if (del_len == item_size) {
94 ret = btrfs_del_item(trans, root, path);
95 goto out;
96 }
97 ptr = (unsigned long)ref;
98 sub_item_len = name_len + sizeof(*ref);
99 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
100 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
101 item_size - (ptr + sub_item_len - item_start));
102 ret = btrfs_truncate_item(trans, root, path,
103 item_size - sub_item_len, 1);
104 BUG_ON(ret);
105out:
106 btrfs_free_path(path);
107 return ret;
108}
109
110int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root,
112 const char *name, int name_len,
113 u64 inode_objectid, u64 ref_objectid, u64 index)
114{
115 struct btrfs_path *path;
116 struct btrfs_key key;
117 struct btrfs_inode_ref *ref;
118 unsigned long ptr;
119 int ret;
120 int ins_len = name_len + sizeof(*ref);
121
122 key.objectid = inode_objectid;
123 key.offset = ref_objectid;
124 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
125
126 path = btrfs_alloc_path();
127 if (!path)
128 return -ENOMEM;
129
130 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len);
132 if (ret == -EEXIST) {
133 u32 old_size;
134
135 if (find_name_in_backref(path, name, name_len, &ref))
136 goto out;
137
138 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
139 ret = btrfs_extend_item(trans, root, path, ins_len);
140 BUG_ON(ret);
141 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
142 struct btrfs_inode_ref);
143 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
144 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
145 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
146 ptr = (unsigned long)(ref + 1);
147 ret = 0;
148 } else if (ret < 0) {
149 goto out;
150 } else {
151 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
152 struct btrfs_inode_ref);
153 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
154 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
155 ptr = (unsigned long)(ref + 1);
156 }
157 write_extent_buffer(path->nodes[0], name, ptr, name_len);
158 btrfs_mark_buffer_dirty(path->nodes[0]);
159
160out:
161 btrfs_free_path(path);
162 return ret;
163}
164
165int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root,
167 struct btrfs_path *path, u64 objectid)
168{
169 struct btrfs_key key;
170 int ret;
171 key.objectid = objectid;
172 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
173 key.offset = 0;
174
175 ret = btrfs_insert_empty_item(trans, root, path, &key,
176 sizeof(struct btrfs_inode_item));
177 if (ret == 0 && objectid > root->highest_inode)
178 root->highest_inode = objectid;
179 return ret;
180}
181
182int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
183 *root, struct btrfs_path *path,
184 struct btrfs_key *location, int mod)
185{
186 int ins_len = mod < 0 ? -1 : 0;
187 int cow = mod != 0;
188 int ret;
189 int slot;
190 struct extent_buffer *leaf;
191 struct btrfs_key found_key;
192
193 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
194 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
195 location->offset == (u64)-1 && path->slots[0] != 0) {
196 slot = path->slots[0] - 1;
197 leaf = path->nodes[0];
198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
199 if (found_key.objectid == location->objectid &&
200 btrfs_key_type(&found_key) == btrfs_key_type(location)) {
201 path->slots[0]--;
202 return 0;
203 }
204 }
205 return ret;
206}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..2aa79873eb46
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,144 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
24{
25 struct btrfs_path *path;
26 int ret;
27 struct extent_buffer *l;
28 struct btrfs_key search_key;
29 struct btrfs_key found_key;
30 int slot;
31
32 path = btrfs_alloc_path();
33 BUG_ON(!path);
34
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1;
37 search_key.offset = (u64)-1;
38 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
39 if (ret < 0)
40 goto error;
41 BUG_ON(ret == 0);
42 if (path->slots[0] > 0) {
43 slot = path->slots[0] - 1;
44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid;
47 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID;
49 }
50 ret = 0;
51error:
52 btrfs_free_path(path);
53 return ret;
54}
55
56/*
57 * walks the btree of allocated inodes and find a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 dirid, u64 *objectid)
62{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 btrfs_init_path(path);
88 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0)
91 goto error;
92
93 while (1) {
94 l = path->nodes[0];
95 slot = path->slots[0];
96 if (slot >= btrfs_header_nritems(l)) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret == 0)
99 continue;
100 if (ret < 0)
101 goto error;
102 if (!start_found) {
103 *objectid = search_start;
104 start_found = 1;
105 goto found;
106 }
107 *objectid = last_ino > search_start ?
108 last_ino : search_start;
109 goto found;
110 }
111 btrfs_item_key_to_cpu(l, &key, slot);
112 if (key.objectid >= search_start) {
113 if (start_found) {
114 if (last_ino < search_start)
115 last_ino = search_start;
116 if (key.objectid > last_ino) {
117 *objectid = last_ino;
118 goto found;
119 }
120 } else if (key.objectid > search_start) {
121 *objectid = search_start;
122 goto found;
123 }
124 }
125 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
126 break;
127
128 start_found = 1;
129 last_ino = key.objectid + 1;
130 path->slots[0]++;
131 }
132 BUG_ON(1);
133found:
134 btrfs_release_path(root, path);
135 btrfs_free_path(path);
136 BUG_ON(*objectid < search_start);
137 mutex_unlock(&root->objectid_mutex);
138 return 0;
139error:
140 btrfs_release_path(root, path);
141 btrfs_free_path(path);
142 mutex_unlock(&root->objectid_mutex);
143 return ret;
144}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..8adfe059ab41
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,5035 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <linux/falloc.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h"
50#include "xattr.h"
51#include "tree-log.h"
52#include "ref-cache.h"
53#include "compression.h"
54
/*
 * Pairs an inode number with the btrfs root it belongs to.
 * NOTE(review): presumably the lookup argument handed to the iget helpers
 * further down this file -- confirm against the users of this struct.
 */
55struct btrfs_iget_args {
56 u64 ino;
57 struct btrfs_root *root;
58};
59
/*
 * Tentative definitions of the file-local operation tables.
 * NOTE(review): the initialised definitions are expected later in this file,
 * outside the part visible here -- confirm.
 */
60static struct inode_operations btrfs_dir_inode_operations;
61static struct inode_operations btrfs_symlink_inode_operations;
62static struct inode_operations btrfs_dir_ro_inode_operations;
63static struct inode_operations btrfs_special_inode_operations;
64static struct inode_operations btrfs_file_inode_operations;
65static struct address_space_operations btrfs_aops;
66static struct address_space_operations btrfs_symlink_aops;
67static struct file_operations btrfs_dir_file_operations;
68static struct extent_io_ops btrfs_extent_io_ops;
69
/*
 * Slab caches.  Only btrfs_inode_cachep is file-local; the other four have
 * external linkage.
 */
70static struct kmem_cache *btrfs_inode_cachep;
71struct kmem_cache *btrfs_trans_handle_cachep;
72struct kmem_cache *btrfs_transaction_cachep;
73struct kmem_cache *btrfs_bit_radix_cachep;
74struct kmem_cache *btrfs_path_cachep;
75
/*
 * Lookup table from the file-type bits of a mode to the matching BTRFS_FT_*
 * value.  The index is an S_IF* constant shifted right by S_SHIFT, so the
 * table has S_IFMT >> S_SHIFT entries; slots not listed here are zero.
 */
76#define S_SHIFT 12
77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
78 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
79 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
80 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
81 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
82 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
83 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
85};
86
/*
 * Forward declarations.  cow_file_range() is declared here because
 * submit_compressed_extents() calls it before its definition.
 * NOTE(review): the other two are presumably defined and used further down
 * this file -- confirm.
 */
87static void btrfs_truncate(struct inode *inode);
88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock);
93
94/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{
101 u64 total;
102 u64 used;
103 u64 thresh;
104 int ret = 0;
105
106 spin_lock(&root->fs_info->delalloc_lock);
107 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
108 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
109 if (for_del)
110 thresh = total * 90;
111 else
112 thresh = total * 85;
113
114 do_div(thresh, 100);
115
116 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
117 ret = -ENOSPC;
118 spin_unlock(&root->fs_info->delalloc_lock);
119 return ret;
120}
121
122/*
123 * this does all the hard work for inserting an inline extent into
124 * the btree. The caller should have done a btrfs_drop_extents so that
125 * no overlapping inline items exist in the btree
126 */
127static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
128 struct btrfs_root *root, struct inode *inode,
129 u64 start, size_t size, size_t compressed_size,
130 struct page **compressed_pages)
131{
132 struct btrfs_key key;
133 struct btrfs_path *path;
134 struct extent_buffer *leaf;
135 struct page *page = NULL;
136 char *kaddr;
137 unsigned long ptr;
138 struct btrfs_file_extent_item *ei;
139 int err = 0;
140 int ret;
141 size_t cur_size = size;
142 size_t datasize;
143 unsigned long offset;
144 int use_compress = 0;
145
146 if (compressed_size && compressed_pages) {
147 use_compress = 1;
148 cur_size = compressed_size;
149 }
150
151 path = btrfs_alloc_path();
152 if (!path)
153 return -ENOMEM;
154
155 btrfs_set_trans_block_group(trans, inode);
156
157 key.objectid = inode->i_ino;
158 key.offset = start;
159 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
160 datasize = btrfs_file_extent_calc_inline_size(cur_size);
161
162 inode_add_bytes(inode, size);
163 ret = btrfs_insert_empty_item(trans, root, path, &key,
164 datasize);
165 BUG_ON(ret);
166 if (ret) {
167 err = ret;
168 goto fail;
169 }
170 leaf = path->nodes[0];
171 ei = btrfs_item_ptr(leaf, path->slots[0],
172 struct btrfs_file_extent_item);
173 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
174 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
175 btrfs_set_file_extent_encryption(leaf, ei, 0);
176 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
177 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
178 ptr = btrfs_file_extent_inline_start(ei);
179
180 if (use_compress) {
181 struct page *cpage;
182 int i = 0;
183 while (compressed_size > 0) {
184 cpage = compressed_pages[i];
185 cur_size = min_t(unsigned long, compressed_size,
186 PAGE_CACHE_SIZE);
187
188 kaddr = kmap(cpage);
189 write_extent_buffer(leaf, kaddr, ptr, cur_size);
190 kunmap(cpage);
191
192 i++;
193 ptr += cur_size;
194 compressed_size -= cur_size;
195 }
196 btrfs_set_file_extent_compression(leaf, ei,
197 BTRFS_COMPRESS_ZLIB);
198 } else {
199 page = find_get_page(inode->i_mapping,
200 start >> PAGE_CACHE_SHIFT);
201 btrfs_set_file_extent_compression(leaf, ei, 0);
202 kaddr = kmap_atomic(page, KM_USER0);
203 offset = start & (PAGE_CACHE_SIZE - 1);
204 write_extent_buffer(leaf, kaddr + offset, ptr, size);
205 kunmap_atomic(kaddr, KM_USER0);
206 page_cache_release(page);
207 }
208 btrfs_mark_buffer_dirty(leaf);
209 btrfs_free_path(path);
210
211 BTRFS_I(inode)->disk_i_size = inode->i_size;
212 btrfs_update_inode(trans, root, inode);
213 return 0;
214fail:
215 btrfs_free_path(path);
216 return err;
217}
218
219
220/*
221 * conditionally insert an inline extent into the file. This
222 * does the checks required to make sure the data is small enough
223 * to fit as an inline extent.
224 */
225static int cow_file_range_inline(struct btrfs_trans_handle *trans,
226 struct btrfs_root *root,
227 struct inode *inode, u64 start, u64 end,
228 size_t compressed_size,
229 struct page **compressed_pages)
230{
231 u64 isize = i_size_read(inode);
232 u64 actual_end = min(end + 1, isize);
233 u64 inline_len = actual_end - start;
234 u64 aligned_end = (end + root->sectorsize - 1) &
235 ~((u64)root->sectorsize - 1);
236 u64 hint_byte;
237 u64 data_len = inline_len;
238 int ret;
239
240 if (compressed_size)
241 data_len = compressed_size;
242
243 if (start > 0 ||
244 actual_end >= PAGE_CACHE_SIZE ||
245 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
246 (!compressed_size &&
247 (actual_end & (root->sectorsize - 1)) == 0) ||
248 end + 1 < isize ||
249 data_len > root->fs_info->max_inline) {
250 return 1;
251 }
252
253 ret = btrfs_drop_extents(trans, root, inode, start,
254 aligned_end, start, &hint_byte);
255 BUG_ON(ret);
256
257 if (isize > actual_end)
258 inline_len = min_t(u64, isize, actual_end);
259 ret = insert_inline_extent(trans, root, inode, start,
260 inline_len, compressed_size,
261 compressed_pages);
262 BUG_ON(ret);
263 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
264 return 0;
265}
266
/*
 * One unit of work queued by compress_file_range() on async_cow->extents
 * and consumed by submit_compressed_extents().
 */
267struct async_extent {
/* file offset where the extent starts */
268 u64 start;
/* length of the range in the file; start + ram_size - 1 is its last byte */
269 u64 ram_size;
/* bytes to reserve on disk for the compressed data (0 when uncompressed) */
270 u64 compressed_size;
/* compressed pages, or NULL when the range is to be written uncompressed */
271 struct page **pages;
/* number of entries in pages */
272 unsigned long nr_pages;
/* link on the async_cow->extents list */
273 struct list_head list;
274};
275
/*
 * State shared between compress_file_range() and
 * submit_compressed_extents() for one range of a file.
 * NOTE(review): inode, root, start, end and work are presumably filled in by
 * the code that queues the work item, which is outside the part visible
 * here -- confirm.
 */
276struct async_cow {
277 struct inode *inode;
278 struct btrfs_root *root;
/* passed on to cow_file_range() for ranges written uncompressed */
279 struct page *locked_page;
280 u64 start;
281 u64 end;
/* list of struct async_extent, appended to by add_async_extent() */
282 struct list_head extents;
283 struct btrfs_work work;
284};
285
286static noinline int add_async_extent(struct async_cow *cow,
287 u64 start, u64 ram_size,
288 u64 compressed_size,
289 struct page **pages,
290 unsigned long nr_pages)
291{
292 struct async_extent *async_extent;
293
294 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
295 async_extent->start = start;
296 async_extent->ram_size = ram_size;
297 async_extent->compressed_size = compressed_size;
298 async_extent->pages = pages;
299 async_extent->nr_pages = nr_pages;
300 list_add_tail(&async_extent->list, &cow->extents);
301 return 0;
302}
303
304/*
305 * we create compressed extents in two phases. The first
306 * phase compresses a range of pages that have already been
307 * locked (both pages and state bits are locked).
308 *
309 * This is done inside an ordered work queue, and the compression
310 * is spread across many cpus. The actual IO submission is step
311 * two, and the ordered work queue takes care of making sure that
312 * happens in the same order things were put onto the queue by
313 * writepages and friends.
314 *
315 * If this code finds it can't get good compression, it puts an
316 * entry onto the work queue to write the uncompressed bytes. This
317 * makes sure that both compressed inodes and uncompressed inodes
318 * are written in the same order that pdflush sent them down.
319 */
320static noinline int compress_file_range(struct inode *inode,
321 struct page *locked_page,
322 u64 start, u64 end,
323 struct async_cow *async_cow,
324 int *num_added)
325{
326 struct btrfs_root *root = BTRFS_I(inode)->root;
327 struct btrfs_trans_handle *trans;
328 u64 num_bytes;
329 u64 orig_start;
330 u64 disk_num_bytes;
331 u64 blocksize = root->sectorsize;
332 u64 actual_end;
333 u64 isize = i_size_read(inode);
334 int ret = 0;
335 struct page **pages = NULL;
336 unsigned long nr_pages;
337 unsigned long nr_pages_ret = 0;
338 unsigned long total_compressed = 0;
339 unsigned long total_in = 0;
340 unsigned long max_compressed = 128 * 1024;
341 unsigned long max_uncompressed = 128 * 1024;
342 int i;
343 int will_compress;
344
345 orig_start = start;
346
347 actual_end = min_t(u64, isize, end + 1);
348again:
349 will_compress = 0;
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352
353 total_compressed = actual_end - start;
354
355 /* we want to make sure that amount of ram required to uncompress
356 * an extent is reasonable, so we limit the total size in ram
357 * of a compressed extent to 128k. This is a crucial number
358 * because it also controls how easily we can spread reads across
359 * cpus for decompression.
360 *
361 * We also want to make sure the amount of IO required to do
362 * a random read is reasonably small, so we limit the size of
363 * a compressed extent to 128k.
364 */
365 total_compressed = min(total_compressed, max_uncompressed);
366 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
367 num_bytes = max(blocksize, num_bytes);
368 disk_num_bytes = num_bytes;
369 total_in = 0;
370 ret = 0;
371
372 /*
373 * we do compression for mount -o compress and when the
374 * inode has not been flagged as nocompress. This flag can
375 * change at any time if we discover bad compression ratios.
376 */
377 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
378 btrfs_test_opt(root, COMPRESS)) {
379 WARN_ON(pages);
380 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
381
382 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
383 total_compressed, pages,
384 nr_pages, &nr_pages_ret,
385 &total_in,
386 &total_compressed,
387 max_compressed);
388
389 if (!ret) {
390 unsigned long offset = total_compressed &
391 (PAGE_CACHE_SIZE - 1);
392 struct page *page = pages[nr_pages_ret - 1];
393 char *kaddr;
394
395 /* zero the tail end of the last page, we might be
396 * sending it down to disk
397 */
398 if (offset) {
399 kaddr = kmap_atomic(page, KM_USER0);
400 memset(kaddr + offset, 0,
401 PAGE_CACHE_SIZE - offset);
402 kunmap_atomic(kaddr, KM_USER0);
403 }
404 will_compress = 1;
405 }
406 }
407 if (start == 0) {
408 trans = btrfs_join_transaction(root, 1);
409 BUG_ON(!trans);
410 btrfs_set_trans_block_group(trans, inode);
411
412 /* lets try to make an inline extent */
413 if (ret || total_in < (actual_end - start)) {
414 /* we didn't compress the entire range, try
415 * to make an uncompressed inline extent.
416 */
417 ret = cow_file_range_inline(trans, root, inode,
418 start, end, 0, NULL);
419 } else {
420 /* try making a compressed inline extent */
421 ret = cow_file_range_inline(trans, root, inode,
422 start, end,
423 total_compressed, pages);
424 }
425 btrfs_end_transaction(trans, root);
426 if (ret == 0) {
427 /*
428 * inline extent creation worked, we don't need
429 * to create any more async work items. Unlock
430 * and free up our temp pages.
431 */
432 extent_clear_unlock_delalloc(inode,
433 &BTRFS_I(inode)->io_tree,
434 start, end, NULL, 1, 0,
435 0, 1, 1, 1);
436 ret = 0;
437 goto free_pages_out;
438 }
439 }
440
441 if (will_compress) {
442 /*
443 * we aren't doing an inline extent round the compressed size
444 * up to a block size boundary so the allocator does sane
445 * things
446 */
447 total_compressed = (total_compressed + blocksize - 1) &
448 ~(blocksize - 1);
449
450 /*
451 * one last check to make sure the compression is really a
452 * win, compare the page count read with the blocks on disk
453 */
454 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
455 ~(PAGE_CACHE_SIZE - 1);
456 if (total_compressed >= total_in) {
457 will_compress = 0;
458 } else {
459 disk_num_bytes = total_compressed;
460 num_bytes = total_in;
461 }
462 }
463 if (!will_compress && pages) {
464 /*
465 * the compression code ran but failed to make things smaller,
466 * free any pages it allocated and our page pointer array
467 */
468 for (i = 0; i < nr_pages_ret; i++) {
469 WARN_ON(pages[i]->mapping);
470 page_cache_release(pages[i]);
471 }
472 kfree(pages);
473 pages = NULL;
474 total_compressed = 0;
475 nr_pages_ret = 0;
476
477 /* flag the file so we don't compress in the future */
478 btrfs_set_flag(inode, NOCOMPRESS);
479 }
480 if (will_compress) {
481 *num_added += 1;
482
483 /* the async work queues will take care of doing actual
484 * allocation on disk for these compressed pages,
485 * and will submit them to the elevator.
486 */
487 add_async_extent(async_cow, start, num_bytes,
488 total_compressed, pages, nr_pages_ret);
489
490 if (start + num_bytes < end && start + num_bytes < actual_end) {
491 start += num_bytes;
492 pages = NULL;
493 cond_resched();
494 goto again;
495 }
496 } else {
497 /*
498 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked
500 * page if it corresponds to our extent and set things up
501 * for the async work queue to run cow_file_range to do
502 * the normal delalloc dance
503 */
504 if (page_offset(locked_page) >= start &&
505 page_offset(locked_page) <= end) {
506 __set_page_dirty_nobuffers(locked_page);
507 /* unlocked later on in the async handlers */
508 }
509 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
510 *num_added += 1;
511 }
512
513out:
514 return 0;
515
516free_pages_out:
517 for (i = 0; i < nr_pages_ret; i++) {
518 WARN_ON(pages[i]->mapping);
519 page_cache_release(pages[i]);
520 }
521 kfree(pages);
522
523 goto out;
524}
525
526/*
527 * phase two of compressed writeback. This is the ordered portion
528 * of the code, which only gets called in the order the work was
529 * queued. We walk all the async extents created by compress_file_range
530 * and send them down to the disk.
531 */
/*
 * Drains async_cow->extents front to back, freeing each entry once it has
 * been handled.  Entries without pages are written uncompressed through
 * cow_file_range(); entries with pages get a disk reservation, an extent
 * mapping and an ordered extent before the compressed pages are submitted.
 * Returns 0; failures of the individual steps are caught with BUG_ON().
 */
532static noinline int submit_compressed_extents(struct inode *inode,
533 struct async_cow *async_cow)
534{
535 struct async_extent *async_extent;
536 u64 alloc_hint = 0;
537 struct btrfs_trans_handle *trans;
538 struct btrfs_key ins;
539 struct extent_map *em;
540 struct btrfs_root *root = BTRFS_I(inode)->root;
541 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
542 struct extent_io_tree *io_tree;
543 int ret;
544
545 if (list_empty(&async_cow->extents))
546 return 0;
547
548 trans = btrfs_join_transaction(root, 1);
549
550 while (!list_empty(&async_cow->extents)) {
/* take the oldest entry off the list; it is freed below */
551 async_extent = list_entry(async_cow->extents.next,
552 struct async_extent, list);
553 list_del(&async_extent->list);
554
555 io_tree = &BTRFS_I(inode)->io_tree;
556
557 /* did the compression code fall back to uncompressed IO? */
558 if (!async_extent->pages) {
559 int page_started = 0;
560 unsigned long nr_written = 0;
561
562 lock_extent(io_tree, async_extent->start,
563 async_extent->start +
564 async_extent->ram_size - 1, GFP_NOFS);
565
566 /* allocate blocks */
567 cow_file_range(inode, async_cow->locked_page,
568 async_extent->start,
569 async_extent->start +
570 async_extent->ram_size - 1,
571 &page_started, &nr_written, 0);
572
573 /*
574 * if page_started, cow_file_range inserted an
575 * inline extent and took care of all the unlocking
576 * and IO for us. Otherwise, we need to submit
577 * all those pages down to the drive.
578 */
579 if (!page_started)
580 extent_write_locked_range(io_tree,
581 inode, async_extent->start,
582 async_extent->start +
583 async_extent->ram_size - 1,
584 btrfs_get_extent,
585 WB_SYNC_ALL);
586 kfree(async_extent);
587 cond_resched();
588 continue;
589 }
590
591 lock_extent(io_tree, async_extent->start,
592 async_extent->start + async_extent->ram_size - 1,
593 GFP_NOFS);
594 /*
595 * here we're doing allocation and writeback of the
596 * compressed pages
597 */
598 btrfs_drop_extent_cache(inode, async_extent->start,
599 async_extent->start +
600 async_extent->ram_size - 1, 0);
601
/* reserve exactly compressed_size bytes; the result lands in ins */
602 ret = btrfs_reserve_extent(trans, root,
603 async_extent->compressed_size,
604 async_extent->compressed_size,
605 0, alloc_hint,
606 (u64)-1, &ins, 1);
607 BUG_ON(ret);
/* map the file range (ram_size bytes) onto the reserved disk extent */
608 em = alloc_extent_map(GFP_NOFS);
609 em->start = async_extent->start;
610 em->len = async_extent->ram_size;
611 em->orig_start = em->start;
612
613 em->block_start = ins.objectid;
614 em->block_len = ins.offset;
615 em->bdev = root->fs_info->fs_devices->latest_bdev;
616 set_bit(EXTENT_FLAG_PINNED, &em->flags);
617 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
618
/* on -EEXIST drop the cached mappings for the range and try again */
619 while (1) {
620 spin_lock(&em_tree->lock);
621 ret = add_extent_mapping(em_tree, em);
622 spin_unlock(&em_tree->lock);
623 if (ret != -EEXIST) {
624 free_extent_map(em);
625 break;
626 }
627 btrfs_drop_extent_cache(inode, async_extent->start,
628 async_extent->start +
629 async_extent->ram_size - 1, 0);
630 }
631
632 ret = btrfs_add_ordered_extent(inode, async_extent->start,
633 ins.objectid,
634 async_extent->ram_size,
635 ins.offset,
636 BTRFS_ORDERED_COMPRESSED);
637 BUG_ON(ret);
638
/* the transaction is ended before the write is submitted and rejoined after */
639 btrfs_end_transaction(trans, root);
640
641 /*
642 * clear dirty, set writeback and unlock the pages.
643 */
644 extent_clear_unlock_delalloc(inode,
645 &BTRFS_I(inode)->io_tree,
646 async_extent->start,
647 async_extent->start +
648 async_extent->ram_size - 1,
649 NULL, 1, 1, 0, 1, 1, 0);
650
651 ret = btrfs_submit_compressed_write(inode,
652 async_extent->start,
653 async_extent->ram_size,
654 ins.objectid,
655 ins.offset, async_extent->pages,
656 async_extent->nr_pages);
657
658 BUG_ON(ret);
659 trans = btrfs_join_transaction(root, 1);
/* hint the next reservation to start right after this extent */
660 alloc_hint = ins.objectid + ins.offset;
661 kfree(async_extent);
662 cond_resched();
663 }
664
665 btrfs_end_transaction(trans, root);
666 return 0;
667}
668
669/*
670 * when extent_io.c finds a delayed allocation range in the file,
671 * the call backs end up in this code. The basic idea is to
672 * allocate extents on disk for the range, and create ordered data structs
673 * in ram to track those extents.
674 *
675 * locked_page is the page that writepage had locked already. We use
676 * it to make sure we don't do extra locks or unlocks.
677 *
678 * *page_started is set to one if we unlock locked_page and do everything
679 * required to start IO on it. It may be clean and already done with
680 * IO when we return.
681 */
682static noinline int cow_file_range(struct inode *inode,
683 struct page *locked_page,
684 u64 start, u64 end, int *page_started,
685 unsigned long *nr_written,
686 int unlock)
687{
688 struct btrfs_root *root = BTRFS_I(inode)->root;
689 struct btrfs_trans_handle *trans;
690 u64 alloc_hint = 0;
691 u64 num_bytes;
692 unsigned long ram_size;
693 u64 disk_num_bytes;
694 u64 cur_alloc_size;
695 u64 blocksize = root->sectorsize;
696 u64 actual_end;
697 u64 isize = i_size_read(inode);
698 struct btrfs_key ins;
699 struct extent_map *em;
700 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
701 int ret = 0;
702
703 trans = btrfs_join_transaction(root, 1);
704 BUG_ON(!trans);
705 btrfs_set_trans_block_group(trans, inode);
706
707 actual_end = min_t(u64, isize, end + 1);
708
709 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
710 num_bytes = max(blocksize, num_bytes);
711 disk_num_bytes = num_bytes;
712 ret = 0;
713
714 if (start == 0) {
715 /* lets try to make an inline extent */
716 ret = cow_file_range_inline(trans, root, inode,
717 start, end, 0, NULL);
718 if (ret == 0) {
719 extent_clear_unlock_delalloc(inode,
720 &BTRFS_I(inode)->io_tree,
721 start, end, NULL, 1, 1,
722 1, 1, 1, 1);
723 *nr_written = *nr_written +
724 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
725 *page_started = 1;
726 ret = 0;
727 goto out;
728 }
729 }
730
731 BUG_ON(disk_num_bytes >
732 btrfs_super_total_bytes(&root->fs_info->super_copy));
733
734 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
735
736 while (disk_num_bytes > 0) {
737 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
738 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
739 root->sectorsize, 0, alloc_hint,
740 (u64)-1, &ins, 1);
741 BUG_ON(ret);
742
743 em = alloc_extent_map(GFP_NOFS);
744 em->start = start;
745 em->orig_start = em->start;
746
747 ram_size = ins.offset;
748 em->len = ins.offset;
749
750 em->block_start = ins.objectid;
751 em->block_len = ins.offset;
752 em->bdev = root->fs_info->fs_devices->latest_bdev;
753 set_bit(EXTENT_FLAG_PINNED, &em->flags);
754
755 while (1) {
756 spin_lock(&em_tree->lock);
757 ret = add_extent_mapping(em_tree, em);
758 spin_unlock(&em_tree->lock);
759 if (ret != -EEXIST) {
760 free_extent_map(em);
761 break;
762 }
763 btrfs_drop_extent_cache(inode, start,
764 start + ram_size - 1, 0);
765 }
766
767 cur_alloc_size = ins.offset;
768 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
769 ram_size, cur_alloc_size, 0);
770 BUG_ON(ret);
771
772 if (root->root_key.objectid ==
773 BTRFS_DATA_RELOC_TREE_OBJECTID) {
774 ret = btrfs_reloc_clone_csums(inode, start,
775 cur_alloc_size);
776 BUG_ON(ret);
777 }
778
779 if (disk_num_bytes < cur_alloc_size)
780 break;
781
782 /* we're not doing compressed IO, don't unlock the first
783 * page (which the caller expects to stay locked), don't
784 * clear any dirty bits and don't set any writeback bits
785 */
786 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
787 start, start + ram_size - 1,
788 locked_page, unlock, 1,
789 1, 0, 0, 0);
790 disk_num_bytes -= cur_alloc_size;
791 num_bytes -= cur_alloc_size;
792 alloc_hint = ins.objectid + ins.offset;
793 start += cur_alloc_size;
794 }
795out:
796 ret = 0;
797 btrfs_end_transaction(trans, root);
798
799 return ret;
800}
801
802/*
803 * work queue call back to started compression on a file and pages
804 */
805static noinline void async_cow_start(struct btrfs_work *work)
806{
807 struct async_cow *async_cow;
808 int num_added = 0;
809 async_cow = container_of(work, struct async_cow, work);
810
811 compress_file_range(async_cow->inode, async_cow->locked_page,
812 async_cow->start, async_cow->end, async_cow,
813 &num_added);
814 if (num_added == 0)
815 async_cow->inode = NULL;
816}
817
818/*
819 * work queue call back to submit previously compressed pages
820 */
821static noinline void async_cow_submit(struct btrfs_work *work)
822{
823 struct async_cow *async_cow;
824 struct btrfs_root *root;
825 unsigned long nr_pages;
826
827 async_cow = container_of(work, struct async_cow, work);
828
829 root = async_cow->root;
830 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
831 PAGE_CACHE_SHIFT;
832
833 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
834
835 if (atomic_read(&root->fs_info->async_delalloc_pages) <
836 5 * 1042 * 1024 &&
837 waitqueue_active(&root->fs_info->async_submit_wait))
838 wake_up(&root->fs_info->async_submit_wait);
839
840 if (async_cow->inode)
841 submit_compressed_extents(async_cow->inode, async_cow);
842}
843
844static noinline void async_cow_free(struct btrfs_work *work)
845{
846 struct async_cow *async_cow;
847 async_cow = container_of(work, struct async_cow, work);
848 kfree(async_cow);
849}
850
851static int cow_file_range_async(struct inode *inode, struct page *locked_page,
852 u64 start, u64 end, int *page_started,
853 unsigned long *nr_written)
854{
855 struct async_cow *async_cow;
856 struct btrfs_root *root = BTRFS_I(inode)->root;
857 unsigned long nr_pages;
858 u64 cur_end;
859 int limit = 10 * 1024 * 1042;
860
861 if (!btrfs_test_opt(root, COMPRESS)) {
862 return cow_file_range(inode, locked_page, start, end,
863 page_started, nr_written, 1);
864 }
865
866 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
867 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
868 while (start < end) {
869 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
870 async_cow->inode = inode;
871 async_cow->root = root;
872 async_cow->locked_page = locked_page;
873 async_cow->start = start;
874
875 if (btrfs_test_flag(inode, NOCOMPRESS))
876 cur_end = end;
877 else
878 cur_end = min(end, start + 512 * 1024 - 1);
879
880 async_cow->end = cur_end;
881 INIT_LIST_HEAD(&async_cow->extents);
882
883 async_cow->work.func = async_cow_start;
884 async_cow->work.ordered_func = async_cow_submit;
885 async_cow->work.ordered_free = async_cow_free;
886 async_cow->work.flags = 0;
887
888 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
889 PAGE_CACHE_SHIFT;
890 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
891
892 btrfs_queue_worker(&root->fs_info->delalloc_workers,
893 &async_cow->work);
894
895 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
896 wait_event(root->fs_info->async_submit_wait,
897 (atomic_read(&root->fs_info->async_delalloc_pages) <
898 limit));
899 }
900
901 while (atomic_read(&root->fs_info->async_submit_draining) &&
902 atomic_read(&root->fs_info->async_delalloc_pages)) {
903 wait_event(root->fs_info->async_submit_wait,
904 (atomic_read(&root->fs_info->async_delalloc_pages) ==
905 0));
906 }
907
908 *nr_written += nr_pages;
909 start = cur_end + 1;
910 }
911 *page_started = 1;
912 return 0;
913}
914
915static noinline int csum_exist_in_range(struct btrfs_root *root,
916 u64 bytenr, u64 num_bytes)
917{
918 int ret;
919 struct btrfs_ordered_sum *sums;
920 LIST_HEAD(list);
921
922 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
923 bytenr + num_bytes - 1, &list);
924 if (ret == 0 && list_empty(&list))
925 return 0;
926
927 while (!list_empty(&list)) {
928 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
929 list_del(&sums->list);
930 kfree(sums);
931 }
932 return 1;
933}
934
935/*
936 * when nowcow writeback call back. This checks for snapshots or COW copies
937 * of the extents that exist in the file, and COWs the file as required.
938 *
939 * If no cow copies or snapshots exist, we write directly to the existing
940 * blocks on disk
941 */
942static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
943 u64 start, u64 end, int *page_started, int force,
944 unsigned long *nr_written)
945{
946 struct btrfs_root *root = BTRFS_I(inode)->root;
947 struct btrfs_trans_handle *trans;
948 struct extent_buffer *leaf;
949 struct btrfs_path *path;
950 struct btrfs_file_extent_item *fi;
951 struct btrfs_key found_key;
952 u64 cow_start;
953 u64 cur_offset;
954 u64 extent_end;
955 u64 disk_bytenr;
956 u64 num_bytes;
957 int extent_type;
958 int ret;
959 int type;
960 int nocow;
961 int check_prev = 1;
962
963 path = btrfs_alloc_path();
964 BUG_ON(!path);
965 trans = btrfs_join_transaction(root, 1);
966 BUG_ON(!trans);
967
968 cow_start = (u64)-1;
969 cur_offset = start;
970 while (1) {
971 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
972 cur_offset, 0);
973 BUG_ON(ret < 0);
974 if (ret > 0 && path->slots[0] > 0 && check_prev) {
975 leaf = path->nodes[0];
976 btrfs_item_key_to_cpu(leaf, &found_key,
977 path->slots[0] - 1);
978 if (found_key.objectid == inode->i_ino &&
979 found_key.type == BTRFS_EXTENT_DATA_KEY)
980 path->slots[0]--;
981 }
982 check_prev = 0;
983next_slot:
984 leaf = path->nodes[0];
985 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
986 ret = btrfs_next_leaf(root, path);
987 if (ret < 0)
988 BUG_ON(1);
989 if (ret > 0)
990 break;
991 leaf = path->nodes[0];
992 }
993
994 nocow = 0;
995 disk_bytenr = 0;
996 num_bytes = 0;
997 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
998
999 if (found_key.objectid > inode->i_ino ||
1000 found_key.type > BTRFS_EXTENT_DATA_KEY ||
1001 found_key.offset > end)
1002 break;
1003
1004 if (found_key.offset > cur_offset) {
1005 extent_end = found_key.offset;
1006 goto out_check;
1007 }
1008
1009 fi = btrfs_item_ptr(leaf, path->slots[0],
1010 struct btrfs_file_extent_item);
1011 extent_type = btrfs_file_extent_type(leaf, fi);
1012
1013 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1014 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1015 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1016 extent_end = found_key.offset +
1017 btrfs_file_extent_num_bytes(leaf, fi);
1018 if (extent_end <= start) {
1019 path->slots[0]++;
1020 goto next_slot;
1021 }
1022 if (disk_bytenr == 0)
1023 goto out_check;
1024 if (btrfs_file_extent_compression(leaf, fi) ||
1025 btrfs_file_extent_encryption(leaf, fi) ||
1026 btrfs_file_extent_other_encoding(leaf, fi))
1027 goto out_check;
1028 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1029 goto out_check;
1030 if (btrfs_extent_readonly(root, disk_bytenr))
1031 goto out_check;
1032 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1033 disk_bytenr))
1034 goto out_check;
1035 disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1036 disk_bytenr += cur_offset - found_key.offset;
1037 num_bytes = min(end + 1, extent_end) - cur_offset;
1038 /*
1039 * force cow if csum exists in the range.
1040 * this ensure that csum for a given extent are
1041 * either valid or do not exist.
1042 */
1043 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1044 goto out_check;
1045 nocow = 1;
1046 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1047 extent_end = found_key.offset +
1048 btrfs_file_extent_inline_len(leaf, fi);
1049 extent_end = ALIGN(extent_end, root->sectorsize);
1050 } else {
1051 BUG_ON(1);
1052 }
1053out_check:
1054 if (extent_end <= start) {
1055 path->slots[0]++;
1056 goto next_slot;
1057 }
1058 if (!nocow) {
1059 if (cow_start == (u64)-1)
1060 cow_start = cur_offset;
1061 cur_offset = extent_end;
1062 if (cur_offset > end)
1063 break;
1064 path->slots[0]++;
1065 goto next_slot;
1066 }
1067
1068 btrfs_release_path(root, path);
1069 if (cow_start != (u64)-1) {
1070 ret = cow_file_range(inode, locked_page, cow_start,
1071 found_key.offset - 1, page_started,
1072 nr_written, 1);
1073 BUG_ON(ret);
1074 cow_start = (u64)-1;
1075 }
1076
1077 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1078 struct extent_map *em;
1079 struct extent_map_tree *em_tree;
1080 em_tree = &BTRFS_I(inode)->extent_tree;
1081 em = alloc_extent_map(GFP_NOFS);
1082 em->start = cur_offset;
1083 em->orig_start = em->start;
1084 em->len = num_bytes;
1085 em->block_len = num_bytes;
1086 em->block_start = disk_bytenr;
1087 em->bdev = root->fs_info->fs_devices->latest_bdev;
1088 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1089 while (1) {
1090 spin_lock(&em_tree->lock);
1091 ret = add_extent_mapping(em_tree, em);
1092 spin_unlock(&em_tree->lock);
1093 if (ret != -EEXIST) {
1094 free_extent_map(em);
1095 break;
1096 }
1097 btrfs_drop_extent_cache(inode, em->start,
1098 em->start + em->len - 1, 0);
1099 }
1100 type = BTRFS_ORDERED_PREALLOC;
1101 } else {
1102 type = BTRFS_ORDERED_NOCOW;
1103 }
1104
1105 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1106 num_bytes, num_bytes, type);
1107 BUG_ON(ret);
1108
1109 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1110 cur_offset, cur_offset + num_bytes - 1,
1111 locked_page, 1, 1, 1, 0, 0, 0);
1112 cur_offset = extent_end;
1113 if (cur_offset > end)
1114 break;
1115 }
1116 btrfs_release_path(root, path);
1117
1118 if (cur_offset <= end && cow_start == (u64)-1)
1119 cow_start = cur_offset;
1120 if (cow_start != (u64)-1) {
1121 ret = cow_file_range(inode, locked_page, cow_start, end,
1122 page_started, nr_written, 1);
1123 BUG_ON(ret);
1124 }
1125
1126 ret = btrfs_end_transaction(trans, root);
1127 BUG_ON(ret);
1128 btrfs_free_path(path);
1129 return 0;
1130}
1131
1132/*
1133 * extent_io.c call back to do delayed allocation processing
1134 */
1135static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1136 u64 start, u64 end, int *page_started,
1137 unsigned long *nr_written)
1138{
1139 int ret;
1140
1141 if (btrfs_test_flag(inode, NODATACOW))
1142 ret = run_delalloc_nocow(inode, locked_page, start, end,
1143 page_started, 1, nr_written);
1144 else if (btrfs_test_flag(inode, PREALLOC))
1145 ret = run_delalloc_nocow(inode, locked_page, start, end,
1146 page_started, 0, nr_written);
1147 else
1148 ret = cow_file_range_async(inode, locked_page, start, end,
1149 page_started, nr_written);
1150
1151 return ret;
1152}
1153
1154/*
1155 * extent_io.c set_bit_hook, used to track delayed allocation
1156 * bytes in this file, and to maintain the list of inodes that
1157 * have pending delalloc work to be done.
1158 */
1159static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1160 unsigned long old, unsigned long bits)
1161{
1162 /*
1163 * set_bit and clear bit hooks normally require _irqsave/restore
1164 * but in this case, we are only testeing for the DELALLOC
1165 * bit, which is only set or cleared with irqs on
1166 */
1167 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1168 struct btrfs_root *root = BTRFS_I(inode)->root;
1169 spin_lock(&root->fs_info->delalloc_lock);
1170 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1171 root->fs_info->delalloc_bytes += end - start + 1;
1172 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1173 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1174 &root->fs_info->delalloc_inodes);
1175 }
1176 spin_unlock(&root->fs_info->delalloc_lock);
1177 }
1178 return 0;
1179}
1180
1181/*
1182 * extent_io.c clear_bit_hook, see set_bit_hook for why
1183 */
1184static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1185 unsigned long old, unsigned long bits)
1186{
1187 /*
1188 * set_bit and clear bit hooks normally require _irqsave/restore
1189 * but in this case, we are only testeing for the DELALLOC
1190 * bit, which is only set or cleared with irqs on
1191 */
1192 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1193 struct btrfs_root *root = BTRFS_I(inode)->root;
1194
1195 spin_lock(&root->fs_info->delalloc_lock);
1196 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1197 printk(KERN_INFO "btrfs warning: delalloc account "
1198 "%llu %llu\n",
1199 (unsigned long long)end - start + 1,
1200 (unsigned long long)
1201 root->fs_info->delalloc_bytes);
1202 root->fs_info->delalloc_bytes = 0;
1203 BTRFS_I(inode)->delalloc_bytes = 0;
1204 } else {
1205 root->fs_info->delalloc_bytes -= end - start + 1;
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1207 }
1208 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1209 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1210 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1211 }
1212 spin_unlock(&root->fs_info->delalloc_lock);
1213 }
1214 return 0;
1215}
1216
1217/*
1218 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1219 * we don't create bios that span stripes or chunks
1220 */
1221int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1222 size_t size, struct bio *bio,
1223 unsigned long bio_flags)
1224{
1225 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1226 struct btrfs_mapping_tree *map_tree;
1227 u64 logical = (u64)bio->bi_sector << 9;
1228 u64 length = 0;
1229 u64 map_length;
1230 int ret;
1231
1232 if (bio_flags & EXTENT_BIO_COMPRESSED)
1233 return 0;
1234
1235 length = bio->bi_size;
1236 map_tree = &root->fs_info->mapping_tree;
1237 map_length = length;
1238 ret = btrfs_map_block(map_tree, READ, logical,
1239 &map_length, NULL, 0);
1240
1241 if (map_length < length + size)
1242 return 1;
1243 return 0;
1244}
1245
1246/*
1247 * in order to insert checksums into the metadata in large chunks,
1248 * we wait until bio submission time. All the pages in the bio are
1249 * checksummed and sums are attached onto the ordered extent record.
1250 *
1251 * At IO completion time the cums attached on the ordered extent record
1252 * are inserted into the btree
1253 */
1254static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1255 struct bio *bio, int mirror_num,
1256 unsigned long bio_flags)
1257{
1258 struct btrfs_root *root = BTRFS_I(inode)->root;
1259 int ret = 0;
1260
1261 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1262 BUG_ON(ret);
1263 return 0;
1264}
1265
1266/*
1267 * in order to insert checksums into the metadata in large chunks,
1268 * we wait until bio submission time. All the pages in the bio are
1269 * checksummed and sums are attached onto the ordered extent record.
1270 *
1271 * At IO completion time the cums attached on the ordered extent record
1272 * are inserted into the btree
1273 */
1274static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1275 int mirror_num, unsigned long bio_flags)
1276{
1277 struct btrfs_root *root = BTRFS_I(inode)->root;
1278 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1279}
1280
1281/*
1282 * extent_io.c submission hook. This does the right thing for csum calculation
1283 * on write, or reading the csums from the tree before a read
1284 */
1285static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1286 int mirror_num, unsigned long bio_flags)
1287{
1288 struct btrfs_root *root = BTRFS_I(inode)->root;
1289 int ret = 0;
1290 int skip_sum;
1291
1292 skip_sum = btrfs_test_flag(inode, NODATASUM);
1293
1294 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1295 BUG_ON(ret);
1296
1297 if (!(rw & (1 << BIO_RW))) {
1298 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1299 return btrfs_submit_compressed_read(inode, bio,
1300 mirror_num, bio_flags);
1301 } else if (!skip_sum)
1302 btrfs_lookup_bio_sums(root, inode, bio, NULL);
1303 goto mapit;
1304 } else if (!skip_sum) {
1305 /* csum items have already been cloned */
1306 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1307 goto mapit;
1308 /* we're doing a write, do the async checksumming */
1309 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1310 inode, rw, bio, mirror_num,
1311 bio_flags, __btrfs_submit_bio_start,
1312 __btrfs_submit_bio_done);
1313 }
1314
1315mapit:
1316 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1317}
1318
1319/*
1320 * given a list of ordered sums record them in the inode. This happens
1321 * at IO completion time based on sums calculated at bio submission time.
1322 */
1323static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset,
1325 struct list_head *list)
1326{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum;
1329
1330 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) {
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list);
1333 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 }
1336 return 0;
1337}
1338
1339int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1340{
1341 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1342 WARN_ON(1);
1343 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1344 GFP_NOFS);
1345}
1346
1347/* see btrfs_writepage_start_hook for details on why this is required */
1348struct btrfs_writepage_fixup {
1349 struct page *page;
1350 struct btrfs_work work;
1351};
1352
1353static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1354{
1355 struct btrfs_writepage_fixup *fixup;
1356 struct btrfs_ordered_extent *ordered;
1357 struct page *page;
1358 struct inode *inode;
1359 u64 page_start;
1360 u64 page_end;
1361
1362 fixup = container_of(work, struct btrfs_writepage_fixup, work);
1363 page = fixup->page;
1364again:
1365 lock_page(page);
1366 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1367 ClearPageChecked(page);
1368 goto out_page;
1369 }
1370
1371 inode = page->mapping->host;
1372 page_start = page_offset(page);
1373 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1374
1375 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1376
1377 /* already ordered? We're done */
1378 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1379 EXTENT_ORDERED, 0)) {
1380 goto out;
1381 }
1382
1383 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1384 if (ordered) {
1385 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1386 page_end, GFP_NOFS);
1387 unlock_page(page);
1388 btrfs_start_ordered_extent(inode, ordered, 1);
1389 goto again;
1390 }
1391
1392 btrfs_set_extent_delalloc(inode, page_start, page_end);
1393 ClearPageChecked(page);
1394out:
1395 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1396out_page:
1397 unlock_page(page);
1398 page_cache_release(page);
1399}
1400
1401/*
1402 * There are a few paths in the higher layers of the kernel that directly
1403 * set the page dirty bit without asking the filesystem if it is a
1404 * good idea. This causes problems because we want to make sure COW
1405 * properly happens and the data=ordered rules are followed.
1406 *
1407 * In our case any range that doesn't have the ORDERED bit set
1408 * hasn't been properly setup for IO. We kick off an async process
1409 * to fix it up. The async helper will wait for ordered extents, set
1410 * the delalloc bit and make it safe to write the page.
1411 */
1412static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1413{
1414 struct inode *inode = page->mapping->host;
1415 struct btrfs_writepage_fixup *fixup;
1416 struct btrfs_root *root = BTRFS_I(inode)->root;
1417 int ret;
1418
1419 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1420 EXTENT_ORDERED, 0);
1421 if (ret)
1422 return 0;
1423
1424 if (PageChecked(page))
1425 return -EAGAIN;
1426
1427 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1428 if (!fixup)
1429 return -EAGAIN;
1430
1431 SetPageChecked(page);
1432 page_cache_get(page);
1433 fixup->work.func = btrfs_writepage_fixup_worker;
1434 fixup->page = page;
1435 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1436 return -EAGAIN;
1437}
1438
1439static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1440 struct inode *inode, u64 file_pos,
1441 u64 disk_bytenr, u64 disk_num_bytes,
1442 u64 num_bytes, u64 ram_bytes,
1443 u8 compression, u8 encryption,
1444 u16 other_encoding, int extent_type)
1445{
1446 struct btrfs_root *root = BTRFS_I(inode)->root;
1447 struct btrfs_file_extent_item *fi;
1448 struct btrfs_path *path;
1449 struct extent_buffer *leaf;
1450 struct btrfs_key ins;
1451 u64 hint;
1452 int ret;
1453
1454 path = btrfs_alloc_path();
1455 BUG_ON(!path);
1456
1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1458 file_pos + num_bytes, file_pos, &hint);
1459 BUG_ON(ret);
1460
1461 ins.objectid = inode->i_ino;
1462 ins.offset = file_pos;
1463 ins.type = BTRFS_EXTENT_DATA_KEY;
1464 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1465 BUG_ON(ret);
1466 leaf = path->nodes[0];
1467 fi = btrfs_item_ptr(leaf, path->slots[0],
1468 struct btrfs_file_extent_item);
1469 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1470 btrfs_set_file_extent_type(leaf, fi, extent_type);
1471 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1472 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1473 btrfs_set_file_extent_offset(leaf, fi, 0);
1474 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1475 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1476 btrfs_set_file_extent_compression(leaf, fi, compression);
1477 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1478 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1479 btrfs_mark_buffer_dirty(leaf);
1480
1481 inode_add_bytes(inode, num_bytes);
1482 btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1483
1484 ins.objectid = disk_bytenr;
1485 ins.offset = disk_num_bytes;
1486 ins.type = BTRFS_EXTENT_ITEM_KEY;
1487 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1488 root->root_key.objectid,
1489 trans->transid, inode->i_ino, &ins);
1490 BUG_ON(ret);
1491
1492 btrfs_free_path(path);
1493 return 0;
1494}
1495
1496/* as ordered data IO finishes, this gets called so we can finish
1497 * an ordered extent if the range of bytes in the file it covers are
1498 * fully written.
1499 */
1500static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1501{
1502 struct btrfs_root *root = BTRFS_I(inode)->root;
1503 struct btrfs_trans_handle *trans;
1504 struct btrfs_ordered_extent *ordered_extent;
1505 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1506 int compressed = 0;
1507 int ret;
1508
1509 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1510 if (!ret)
1511 return 0;
1512
1513 trans = btrfs_join_transaction(root, 1);
1514
1515 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1516 BUG_ON(!ordered_extent);
1517 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1518 goto nocow;
1519
1520 lock_extent(io_tree, ordered_extent->file_offset,
1521 ordered_extent->file_offset + ordered_extent->len - 1,
1522 GFP_NOFS);
1523
1524 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1525 compressed = 1;
1526 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1527 BUG_ON(compressed);
1528 ret = btrfs_mark_extent_written(trans, root, inode,
1529 ordered_extent->file_offset,
1530 ordered_extent->file_offset +
1531 ordered_extent->len);
1532 BUG_ON(ret);
1533 } else {
1534 ret = insert_reserved_file_extent(trans, inode,
1535 ordered_extent->file_offset,
1536 ordered_extent->start,
1537 ordered_extent->disk_len,
1538 ordered_extent->len,
1539 ordered_extent->len,
1540 compressed, 0, 0,
1541 BTRFS_FILE_EXTENT_REG);
1542 BUG_ON(ret);
1543 }
1544 unlock_extent(io_tree, ordered_extent->file_offset,
1545 ordered_extent->file_offset + ordered_extent->len - 1,
1546 GFP_NOFS);
1547nocow:
1548 add_pending_csums(trans, inode, ordered_extent->file_offset,
1549 &ordered_extent->list);
1550
1551 mutex_lock(&BTRFS_I(inode)->extent_mutex);
1552 btrfs_ordered_update_i_size(inode, ordered_extent);
1553 btrfs_update_inode(trans, root, inode);
1554 btrfs_remove_ordered_extent(inode, ordered_extent);
1555 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1556
1557 /* once for us */
1558 btrfs_put_ordered_extent(ordered_extent);
1559 /* once for the tree */
1560 btrfs_put_ordered_extent(ordered_extent);
1561
1562 btrfs_end_transaction(trans, root);
1563 return 0;
1564}
1565
1566static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1567 struct extent_state *state, int uptodate)
1568{
1569 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1570}
1571
1572/*
1573 * When IO fails, either with EIO or csum verification fails, we
1574 * try other mirrors that might have a good copy of the data. This
1575 * io_failure_record is used to record state as we go through all the
1576 * mirrors. If another mirror has good data, the page is set up to date
1577 * and things continue. If a good mirror can't be found, the original
1578 * bio end_io callback is called to indicate things have failed.
1579 */
1580struct io_failure_record {
1581 struct page *page;
1582 u64 start;
1583 u64 len;
1584 u64 logical;
1585 unsigned long bio_flags;
1586 int last_mirror;
1587};
1588
1589static int btrfs_io_failed_hook(struct bio *failed_bio,
1590 struct page *page, u64 start, u64 end,
1591 struct extent_state *state)
1592{
1593 struct io_failure_record *failrec = NULL;
1594 u64 private;
1595 struct extent_map *em;
1596 struct inode *inode = page->mapping->host;
1597 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1598 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1599 struct bio *bio;
1600 int num_copies;
1601 int ret;
1602 int rw;
1603 u64 logical;
1604
1605 ret = get_state_private(failure_tree, start, &private);
1606 if (ret) {
1607 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1608 if (!failrec)
1609 return -ENOMEM;
1610 failrec->start = start;
1611 failrec->len = end - start + 1;
1612 failrec->last_mirror = 0;
1613 failrec->bio_flags = 0;
1614
1615 spin_lock(&em_tree->lock);
1616 em = lookup_extent_mapping(em_tree, start, failrec->len);
1617 if (em->start > start || em->start + em->len < start) {
1618 free_extent_map(em);
1619 em = NULL;
1620 }
1621 spin_unlock(&em_tree->lock);
1622
1623 if (!em || IS_ERR(em)) {
1624 kfree(failrec);
1625 return -EIO;
1626 }
1627 logical = start - em->start;
1628 logical = em->block_start + logical;
1629 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1630 logical = em->block_start;
1631 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1632 }
1633 failrec->logical = logical;
1634 free_extent_map(em);
1635 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1636 EXTENT_DIRTY, GFP_NOFS);
1637 set_state_private(failure_tree, start,
1638 (u64)(unsigned long)failrec);
1639 } else {
1640 failrec = (struct io_failure_record *)(unsigned long)private;
1641 }
1642 num_copies = btrfs_num_copies(
1643 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1644 failrec->logical, failrec->len);
1645 failrec->last_mirror++;
1646 if (!state) {
1647 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1648 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1649 failrec->start,
1650 EXTENT_LOCKED);
1651 if (state && state->start != failrec->start)
1652 state = NULL;
1653 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1654 }
1655 if (!state || failrec->last_mirror > num_copies) {
1656 set_state_private(failure_tree, failrec->start, 0);
1657 clear_extent_bits(failure_tree, failrec->start,
1658 failrec->start + failrec->len - 1,
1659 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1660 kfree(failrec);
1661 return -EIO;
1662 }
1663 bio = bio_alloc(GFP_NOFS, 1);
1664 bio->bi_private = state;
1665 bio->bi_end_io = failed_bio->bi_end_io;
1666 bio->bi_sector = failrec->logical >> 9;
1667 bio->bi_bdev = failed_bio->bi_bdev;
1668 bio->bi_size = 0;
1669
1670 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1671 if (failed_bio->bi_rw & (1 << BIO_RW))
1672 rw = WRITE;
1673 else
1674 rw = READ;
1675
1676 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1677 failrec->last_mirror,
1678 failrec->bio_flags);
1679 return 0;
1680}
1681
1682/*
1683 * each time an IO finishes, we do a fast check in the IO failure tree
1684 * to see if we need to process or clean up an io_failure_record
1685 */
1686static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1687{
1688 u64 private;
1689 u64 private_failure;
1690 struct io_failure_record *failure;
1691 int ret;
1692
1693 private = 0;
1694 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1695 (u64)-1, 1, EXTENT_DIRTY)) {
1696 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1697 start, &private_failure);
1698 if (ret == 0) {
1699 failure = (struct io_failure_record *)(unsigned long)
1700 private_failure;
1701 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1702 failure->start, 0);
1703 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1704 failure->start,
1705 failure->start + failure->len - 1,
1706 EXTENT_DIRTY | EXTENT_LOCKED,
1707 GFP_NOFS);
1708 kfree(failure);
1709 }
1710 }
1711 return 0;
1712}
1713
1714/*
1715 * when reads are done, we need to check csums to verify the data is correct
1716 * if there's a match, we allow the bio to finish. If not, we go through
1717 * the io_failure_record routines to find good copies
1718 */
1719static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1720 struct extent_state *state)
1721{
1722 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1723 struct inode *inode = page->mapping->host;
1724 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1725 char *kaddr;
1726 u64 private = ~(u32)0;
1727 int ret;
1728 struct btrfs_root *root = BTRFS_I(inode)->root;
1729 u32 csum = ~(u32)0;
1730
1731 if (PageChecked(page)) {
1732 ClearPageChecked(page);
1733 goto good;
1734 }
1735 if (btrfs_test_flag(inode, NODATASUM))
1736 return 0;
1737
1738 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1739 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
1740 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1741 GFP_NOFS);
1742 return 0;
1743 }
1744
1745 if (state && state->start == start) {
1746 private = state->private;
1747 ret = 0;
1748 } else {
1749 ret = get_state_private(io_tree, start, &private);
1750 }
1751 kaddr = kmap_atomic(page, KM_USER0);
1752 if (ret)
1753 goto zeroit;
1754
1755 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
1756 btrfs_csum_final(csum, (char *)&csum);
1757 if (csum != private)
1758 goto zeroit;
1759
1760 kunmap_atomic(kaddr, KM_USER0);
1761good:
1762 /* if the io failure tree for this inode is non-empty,
1763 * check to see if we've recovered from a failed IO
1764 */
1765 btrfs_clean_io_failures(inode, start);
1766 return 0;
1767
1768zeroit:
1769 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1770 "private %llu\n", page->mapping->host->i_ino,
1771 (unsigned long long)start, csum,
1772 (unsigned long long)private);
1773 memset(kaddr + offset, 1, end - start + 1);
1774 flush_dcache_page(page);
1775 kunmap_atomic(kaddr, KM_USER0);
1776 if (private == 0)
1777 return 0;
1778 return -EIO;
1779}
1780
1781/*
1782 * This creates an orphan entry for the given inode in case something goes
1783 * wrong in the middle of an unlink/truncate.
1784 */
1785int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1786{
1787 struct btrfs_root *root = BTRFS_I(inode)->root;
1788 int ret = 0;
1789
1790 spin_lock(&root->list_lock);
1791
1792 /* already on the orphan list, we're good */
1793 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1794 spin_unlock(&root->list_lock);
1795 return 0;
1796 }
1797
1798 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1799
1800 spin_unlock(&root->list_lock);
1801
1802 /*
1803 * insert an orphan item to track this unlinked/truncated file
1804 */
1805 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1806
1807 return ret;
1808}
1809
1810/*
1811 * We have done the truncate/delete so we can go ahead and remove the orphan
1812 * item for this particular inode.
1813 */
1814int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1815{
1816 struct btrfs_root *root = BTRFS_I(inode)->root;
1817 int ret = 0;
1818
1819 spin_lock(&root->list_lock);
1820
1821 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1822 spin_unlock(&root->list_lock);
1823 return 0;
1824 }
1825
1826 list_del_init(&BTRFS_I(inode)->i_orphan);
1827 if (!trans) {
1828 spin_unlock(&root->list_lock);
1829 return 0;
1830 }
1831
1832 spin_unlock(&root->list_lock);
1833
1834 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1835
1836 return ret;
1837}
1838
1839/*
1840 * this cleans up any orphans that may be left on the list from the last use
1841 * of this root.
1842 */
1843void btrfs_orphan_cleanup(struct btrfs_root *root)
1844{
1845 struct btrfs_path *path;
1846 struct extent_buffer *leaf;
1847 struct btrfs_item *item;
1848 struct btrfs_key key, found_key;
1849 struct btrfs_trans_handle *trans;
1850 struct inode *inode;
1851 int ret = 0, nr_unlink = 0, nr_truncate = 0;
1852
1853 path = btrfs_alloc_path();
1854 if (!path)
1855 return;
1856 path->reada = -1;
1857
1858 key.objectid = BTRFS_ORPHAN_OBJECTID;
1859 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1860 key.offset = (u64)-1;
1861
1862
1863 while (1) {
1864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1865 if (ret < 0) {
1866 printk(KERN_ERR "Error searching slot for orphan: %d"
1867 "\n", ret);
1868 break;
1869 }
1870
1871 /*
1872 * if ret == 0 means we found what we were searching for, which
1873 * is weird, but possible, so only screw with path if we didnt
1874 * find the key and see if we have stuff that matches
1875 */
1876 if (ret > 0) {
1877 if (path->slots[0] == 0)
1878 break;
1879 path->slots[0]--;
1880 }
1881
1882 /* pull out the item */
1883 leaf = path->nodes[0];
1884 item = btrfs_item_nr(leaf, path->slots[0]);
1885 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1886
1887 /* make sure the item matches what we want */
1888 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1889 break;
1890 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1891 break;
1892
1893 /* release the path since we're done with it */
1894 btrfs_release_path(root, path);
1895
1896 /*
1897 * this is where we are basically btrfs_lookup, without the
1898 * crossing root thing. we store the inode number in the
1899 * offset of the orphan item.
1900 */
1901 inode = btrfs_iget_locked(root->fs_info->sb,
1902 found_key.offset, root);
1903 if (!inode)
1904 break;
1905
1906 if (inode->i_state & I_NEW) {
1907 BTRFS_I(inode)->root = root;
1908
1909 /* have to set the location manually */
1910 BTRFS_I(inode)->location.objectid = inode->i_ino;
1911 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1912 BTRFS_I(inode)->location.offset = 0;
1913
1914 btrfs_read_locked_inode(inode);
1915 unlock_new_inode(inode);
1916 }
1917
1918 /*
1919 * add this inode to the orphan list so btrfs_orphan_del does
1920 * the proper thing when we hit it
1921 */
1922 spin_lock(&root->list_lock);
1923 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1924 spin_unlock(&root->list_lock);
1925
1926 /*
1927 * if this is a bad inode, means we actually succeeded in
1928 * removing the inode, but not the orphan record, which means
1929 * we need to manually delete the orphan since iput will just
1930 * do a destroy_inode
1931 */
1932 if (is_bad_inode(inode)) {
1933 trans = btrfs_start_transaction(root, 1);
1934 btrfs_orphan_del(trans, inode);
1935 btrfs_end_transaction(trans, root);
1936 iput(inode);
1937 continue;
1938 }
1939
1940 /* if we have links, this was a truncate, lets do that */
1941 if (inode->i_nlink) {
1942 nr_truncate++;
1943 btrfs_truncate(inode);
1944 } else {
1945 nr_unlink++;
1946 }
1947
1948 /* this will do delete_inode and everything for us */
1949 iput(inode);
1950 }
1951
1952 if (nr_unlink)
1953 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1954 if (nr_truncate)
1955 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1956
1957 btrfs_free_path(path);
1958}
1959
1960/*
1961 * read an inode from the btree into the in-memory inode
1962 */
1963void btrfs_read_locked_inode(struct inode *inode)
1964{
1965 struct btrfs_path *path;
1966 struct extent_buffer *leaf;
1967 struct btrfs_inode_item *inode_item;
1968 struct btrfs_timespec *tspec;
1969 struct btrfs_root *root = BTRFS_I(inode)->root;
1970 struct btrfs_key location;
1971 u64 alloc_group_block;
1972 u32 rdev;
1973 int ret;
1974
1975 path = btrfs_alloc_path();
1976 BUG_ON(!path);
1977 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1978
1979 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1980 if (ret)
1981 goto make_bad;
1982
1983 leaf = path->nodes[0];
1984 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1985 struct btrfs_inode_item);
1986
1987 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1988 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1989 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1990 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1991 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1992
1993 tspec = btrfs_inode_atime(inode_item);
1994 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1995 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1996
1997 tspec = btrfs_inode_mtime(inode_item);
1998 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1999 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2000
2001 tspec = btrfs_inode_ctime(inode_item);
2002 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2003 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2004
2005 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2006 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2007 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2008 inode->i_generation = BTRFS_I(inode)->generation;
2009 inode->i_rdev = 0;
2010 rdev = btrfs_inode_rdev(leaf, inode_item);
2011
2012 BTRFS_I(inode)->index_cnt = (u64)-1;
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0);
2018 btrfs_free_path(path);
2019 inode_item = NULL;
2020
2021 switch (inode->i_mode & S_IFMT) {
2022 case S_IFREG:
2023 inode->i_mapping->a_ops = &btrfs_aops;
2024 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2025 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2026 inode->i_fop = &btrfs_file_operations;
2027 inode->i_op = &btrfs_file_inode_operations;
2028 break;
2029 case S_IFDIR:
2030 inode->i_fop = &btrfs_dir_file_operations;
2031 if (root == root->fs_info->tree_root)
2032 inode->i_op = &btrfs_dir_ro_inode_operations;
2033 else
2034 inode->i_op = &btrfs_dir_inode_operations;
2035 break;
2036 case S_IFLNK:
2037 inode->i_op = &btrfs_symlink_inode_operations;
2038 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break;
2041 default:
2042 init_special_inode(inode, inode->i_mode, rdev);
2043 break;
2044 }
2045 return;
2046
2047make_bad:
2048 btrfs_free_path(path);
2049 make_bad_inode(inode);
2050}
2051
2052/*
2053 * given a leaf and an inode, copy the inode fields into the leaf
2054 */
2055static void fill_inode_item(struct btrfs_trans_handle *trans,
2056 struct extent_buffer *leaf,
2057 struct btrfs_inode_item *item,
2058 struct inode *inode)
2059{
2060 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2061 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2062 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2063 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2064 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2065
2066 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2067 inode->i_atime.tv_sec);
2068 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2069 inode->i_atime.tv_nsec);
2070
2071 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2072 inode->i_mtime.tv_sec);
2073 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2074 inode->i_mtime.tv_nsec);
2075
2076 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2077 inode->i_ctime.tv_sec);
2078 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2079 inode->i_ctime.tv_nsec);
2080
2081 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2082 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2083 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2084 btrfs_set_inode_transid(leaf, item, trans->transid);
2085 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2086 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2087 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2088}
2089
2090/*
2091 * copy everything in the in-memory inode into the btree.
2092 */
2093noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2094 struct btrfs_root *root, struct inode *inode)
2095{
2096 struct btrfs_inode_item *inode_item;
2097 struct btrfs_path *path;
2098 struct extent_buffer *leaf;
2099 int ret;
2100
2101 path = btrfs_alloc_path();
2102 BUG_ON(!path);
2103 ret = btrfs_lookup_inode(trans, root, path,
2104 &BTRFS_I(inode)->location, 1);
2105 if (ret) {
2106 if (ret > 0)
2107 ret = -ENOENT;
2108 goto failed;
2109 }
2110
2111 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item);
2114
2115 fill_inode_item(trans, leaf, inode_item, inode);
2116 btrfs_mark_buffer_dirty(leaf);
2117 btrfs_set_inode_last_trans(trans, inode);
2118 ret = 0;
2119failed:
2120 btrfs_free_path(path);
2121 return ret;
2122}
2123
2124
2125/*
2126 * unlink helper that gets used here in inode.c and in the tree logging
2127 * recovery code. It remove a link in a directory with a given name, and
2128 * also drops the back refs in the inode to the directory
2129 */
2130int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2131 struct btrfs_root *root,
2132 struct inode *dir, struct inode *inode,
2133 const char *name, int name_len)
2134{
2135 struct btrfs_path *path;
2136 int ret = 0;
2137 struct extent_buffer *leaf;
2138 struct btrfs_dir_item *di;
2139 struct btrfs_key key;
2140 u64 index;
2141
2142 path = btrfs_alloc_path();
2143 if (!path) {
2144 ret = -ENOMEM;
2145 goto err;
2146 }
2147
2148 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2149 name, name_len, -1);
2150 if (IS_ERR(di)) {
2151 ret = PTR_ERR(di);
2152 goto err;
2153 }
2154 if (!di) {
2155 ret = -ENOENT;
2156 goto err;
2157 }
2158 leaf = path->nodes[0];
2159 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2160 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2161 if (ret)
2162 goto err;
2163 btrfs_release_path(root, path);
2164
2165 ret = btrfs_del_inode_ref(trans, root, name, name_len,
2166 inode->i_ino,
2167 dir->i_ino, &index);
2168 if (ret) {
2169 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2170 "inode %lu parent %lu\n", name_len, name,
2171 inode->i_ino, dir->i_ino);
2172 goto err;
2173 }
2174
2175 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2176 index, name, name_len, -1);
2177 if (IS_ERR(di)) {
2178 ret = PTR_ERR(di);
2179 goto err;
2180 }
2181 if (!di) {
2182 ret = -ENOENT;
2183 goto err;
2184 }
2185 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2186 btrfs_release_path(root, path);
2187
2188 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2189 inode, dir->i_ino);
2190 BUG_ON(ret != 0 && ret != -ENOENT);
2191 if (ret != -ENOENT)
2192 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2193
2194 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2195 dir, index);
2196 BUG_ON(ret);
2197err:
2198 btrfs_free_path(path);
2199 if (ret)
2200 goto out;
2201
2202 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2203 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2204 btrfs_update_inode(trans, root, dir);
2205 btrfs_drop_nlink(inode);
2206 ret = btrfs_update_inode(trans, root, inode);
2207 dir->i_sb->s_dirt = 1;
2208out:
2209 return ret;
2210}
2211
2212static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2213{
2214 struct btrfs_root *root;
2215 struct btrfs_trans_handle *trans;
2216 struct inode *inode = dentry->d_inode;
2217 int ret;
2218 unsigned long nr = 0;
2219
2220 root = BTRFS_I(dir)->root;
2221
2222 ret = btrfs_check_free_space(root, 1, 1);
2223 if (ret)
2224 goto fail;
2225
2226 trans = btrfs_start_transaction(root, 1);
2227
2228 btrfs_set_trans_block_group(trans, dir);
2229 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2230 dentry->d_name.name, dentry->d_name.len);
2231
2232 if (inode->i_nlink == 0)
2233 ret = btrfs_orphan_add(trans, inode);
2234
2235 nr = trans->blocks_used;
2236
2237 btrfs_end_transaction_throttle(trans, root);
2238fail:
2239 btrfs_btree_balance_dirty(root, nr);
2240 return ret;
2241}
2242
2243static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2244{
2245 struct inode *inode = dentry->d_inode;
2246 int err = 0;
2247 int ret;
2248 struct btrfs_root *root = BTRFS_I(dir)->root;
2249 struct btrfs_trans_handle *trans;
2250 unsigned long nr = 0;
2251
2252 /*
2253 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2254 * the root of a subvolume or snapshot
2255 */
2256 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2257 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2258 return -ENOTEMPTY;
2259 }
2260
2261 ret = btrfs_check_free_space(root, 1, 1);
2262 if (ret)
2263 goto fail;
2264
2265 trans = btrfs_start_transaction(root, 1);
2266 btrfs_set_trans_block_group(trans, dir);
2267
2268 err = btrfs_orphan_add(trans, inode);
2269 if (err)
2270 goto fail_trans;
2271
2272 /* now the directory is empty */
2273 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2274 dentry->d_name.name, dentry->d_name.len);
2275 if (!err)
2276 btrfs_i_size_write(inode, 0);
2277
2278fail_trans:
2279 nr = trans->blocks_used;
2280 ret = btrfs_end_transaction_throttle(trans, root);
2281fail:
2282 btrfs_btree_balance_dirty(root, nr);
2283
2284 if (ret && !err)
2285 err = ret;
2286 return err;
2287}
2288
2289#if 0
2290/*
2291 * when truncating bytes in a file, it is possible to avoid reading
2292 * the leaves that contain only checksum items. This can be the
2293 * majority of the IO required to delete a large file, but it must
2294 * be done carefully.
2295 *
2296 * The keys in the level just above the leaves are checked to make sure
2297 * the lowest key in a given leaf is a csum key, and starts at an offset
2298 * after the new size.
2299 *
2300 * Then the key for the next leaf is checked to make sure it also has
2301 * a checksum item for the same file. If it does, we know our target leaf
2302 * contains only checksum items, and it can be safely freed without reading
2303 * it.
2304 *
2305 * This is just an optimization targeted at large files. It may do
2306 * nothing. It will return 0 unless things went badly.
2307 */
/*
 * NOTE(review): this function sits inside an "#if 0" region, so it is
 * currently compiled out.
 */
2308static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
2309 struct btrfs_root *root,
2310 struct btrfs_path *path,
2311 struct inode *inode, u64 new_size)
2312{
2313 struct btrfs_key key;
2314 int ret;
2315 int nritems;
2316 struct btrfs_key found_key;
2317 struct btrfs_key other_key;
2318 struct btrfs_leaf_ref *ref;
2319 u64 leaf_gen;
2320 u64 leaf_start;
2321
/*
 * lowest_level is raised to 1 for the searches below and put back to 0
 * at "out"; all checks are done on path->nodes[1].
 */
2322 path->lowest_level = 1;
2323 key.objectid = inode->i_ino;
2324 key.type = BTRFS_CSUM_ITEM_KEY;
2325 key.offset = new_size;
2326again:
2327 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2328 if (ret < 0)
2329 goto out;
2330
/* no level 1 node in the path: nothing to do, report success */
2331 if (path->nodes[1] == NULL) {
2332 ret = 0;
2333 goto out;
2334 }
2335 ret = 0;
2336 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
2337 nritems = btrfs_header_nritems(path->nodes[1]);
2338
2339 if (!nritems)
2340 goto out;
2341
/* the slot is past the last key of this node: continue in the next node */
2342 if (path->slots[1] >= nritems)
2343 goto next_node;
2344
2345 /* did we find a key greater than anything we want to delete? */
2346 if (found_key.objectid > inode->i_ino ||
2347 (found_key.objectid == inode->i_ino && found_key.type > key.type))
2348 goto out;
2349
2350 /* we check the next key in the node to make sure the leave contains
2351 * only checksum items. This comparison doesn't work if our
2352 * leaf is the last one in the node
2353 */
2354 if (path->slots[1] + 1 >= nritems) {
/* the "goto next_node" above jumps into this block */
2355next_node:
2356 /* search forward from the last key in the node, this
2357 * will bring us into the next node in the tree
2358 */
2359 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
2360
2361 /* unlikely, but we inc below, so check to be safe */
2362 if (found_key.offset == (u64)-1)
2363 goto out;
2364
2365 /* search_forward needs a path with locks held, do the
2366 * search again for the original key. It is possible
2367 * this will race with a balance and return a path that
2368 * we could modify, but this drop is just an optimization
2369 * and is allowed to miss some leaves.
2370 */
2371 btrfs_release_path(root, path);
2372 found_key.offset++;
2373
2374 /* setup a max key for search_forward */
2375 other_key.offset = (u64)-1;
2376 other_key.type = key.type;
2377 other_key.objectid = key.objectid;
2378
2379 path->keep_locks = 1;
2380 ret = btrfs_search_forward(root, &found_key, &other_key,
2381 path, 0, 0);
2382 path->keep_locks = 0;
/* stop once the forward search fails or leaves this inode's csum keys */
2383 if (ret || found_key.objectid != key.objectid ||
2384 found_key.type != key.type) {
2385 ret = 0;
2386 goto out;
2387 }
2388
2389 key.offset = found_key.offset;
2390 btrfs_release_path(root, path);
2391 cond_resched();
2392 goto again;
2393 }
2394
2395 /* we know there's one more slot after us in the tree,
2396 * read that key so we can verify it is also a checksum item
2397 */
2398 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
2399
2400 if (found_key.objectid < inode->i_ino)
2401 goto next_key;
2402
2403 if (found_key.type != key.type || found_key.offset < new_size)
2404 goto next_key;
2405
2406 /*
2407 * if the key for the next leaf isn't a csum key from this objectid,
2408 * we can't be sure there aren't good items inside this leaf.
2409 * Bail out
2410 */
2411 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
2412 goto out;
2413
2414 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
2415 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
2416 /*
2417 * it is safe to delete this leaf, it contains only
2418 * csum items from this inode at an offset >= new_size
2419 */
2420 ret = btrfs_del_leaf(trans, root, path, leaf_start);
2421 BUG_ON(ret);
2422
/*
 * for ref_cows roots, a leaf older than this transaction gets a leaf ref
 * with nritems set to 0; failure to allocate or add it only warns
 */
2423 if (root->ref_cows && leaf_gen < trans->transid) {
2424 ref = btrfs_alloc_leaf_ref(root, 0);
2425 if (ref) {
2426 ref->root_gen = root->root_key.offset;
2427 ref->bytenr = leaf_start;
2428 ref->owner = 0;
2429 ref->generation = leaf_gen;
2430 ref->nritems = 0;
2431
2432 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref);
2435 } else {
2436 WARN_ON(1);
2437 }
2438 }
2439next_key:
2440 btrfs_release_path(root, path);
2441
/* continue from the next leaf's key while it is a later csum key of this inode */
2442 if (other_key.objectid == inode->i_ino &&
2443 other_key.type == key.type && other_key.offset > key.offset) {
2444 key.offset = other_key.offset;
2445 cond_resched();
2446 goto again;
2447 }
2448 ret = 0;
2449out:
2450 /* fixup any changes we've made to the path */
2451 path->lowest_level = 0;
2452 path->keep_locks = 0;
2453 btrfs_release_path(root, path);
2454 return ret;
2455}
2456
2457#endif
2458
2459/*
2460 * this can truncate away extent items, csum items and directory items.
2461 * It starts at a high offset and removes keys until it can't find
2462 * any higher than new_size
2463 *
2464 * csum items that cross the new i_size are truncated to the new size
2465 * as well.
2466 *
2467 * min_type is the minimum key type to truncate down to. If set to 0, this
2468 * will kill all the items on this inode, including the INODE_ITEM_KEY.
2469 */
2470noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2471 struct btrfs_root *root,
2472 struct inode *inode,
2473 u64 new_size, u32 min_type)
2474{
2475 int ret;
2476 struct btrfs_path *path;
2477 struct btrfs_key key;
2478 struct btrfs_key found_key;
2479 u32 found_type;
2480 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0;
2483 u64 extent_num_bytes = 0;
2484 u64 item_end = 0;
2485 u64 root_gen = 0;
2486 u64 root_owner = 0;
2487 int found_extent;
2488 int del_item;
2489 int pending_del_nr = 0;
2490 int pending_del_slot = 0;
2491 int extent_type = -1;
2492 int encoding;
2493 u64 mask = root->sectorsize - 1;
2494
2495 if (root->ref_cows)
2496 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2497 path = btrfs_alloc_path();
2498 path->reada = -1;
2499 BUG_ON(!path);
2500
2501 /* FIXME, add redo link to tree so we don't leak on crash */
2502 key.objectid = inode->i_ino;
2503 key.offset = (u64)-1;
2504 key.type = (u8)-1;
2505
2506 btrfs_init_path(path);
2507
2508search_again:
2509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2510 if (ret < 0)
2511 goto error;
2512
2513 if (ret > 0) {
2514 /* there are no items in the tree for us to truncate, we're
2515 * done
2516 */
2517 if (path->slots[0] == 0) {
2518 ret = 0;
2519 goto error;
2520 }
2521 path->slots[0]--;
2522 }
2523
2524 while (1) {
2525 fi = NULL;
2526 leaf = path->nodes[0];
2527 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2528 found_type = btrfs_key_type(&found_key);
2529 encoding = 0;
2530
2531 if (found_key.objectid != inode->i_ino)
2532 break;
2533
2534 if (found_type < min_type)
2535 break;
2536
2537 item_end = found_key.offset;
2538 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2539 fi = btrfs_item_ptr(leaf, path->slots[0],
2540 struct btrfs_file_extent_item);
2541 extent_type = btrfs_file_extent_type(leaf, fi);
2542 encoding = btrfs_file_extent_compression(leaf, fi);
2543 encoding |= btrfs_file_extent_encryption(leaf, fi);
2544 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2545
2546 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2547 item_end +=
2548 btrfs_file_extent_num_bytes(leaf, fi);
2549 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2550 item_end += btrfs_file_extent_inline_len(leaf,
2551 fi);
2552 }
2553 item_end--;
2554 }
2555 if (item_end < new_size) {
2556 if (found_type == BTRFS_DIR_ITEM_KEY)
2557 found_type = BTRFS_INODE_ITEM_KEY;
2558 else if (found_type == BTRFS_EXTENT_ITEM_KEY)
2559 found_type = BTRFS_EXTENT_DATA_KEY;
2560 else if (found_type == BTRFS_EXTENT_DATA_KEY)
2561 found_type = BTRFS_XATTR_ITEM_KEY;
2562 else if (found_type == BTRFS_XATTR_ITEM_KEY)
2563 found_type = BTRFS_INODE_REF_KEY;
2564 else if (found_type)
2565 found_type--;
2566 else
2567 break;
2568 btrfs_set_key_type(&key, found_type);
2569 goto next;
2570 }
2571 if (found_key.offset >= new_size)
2572 del_item = 1;
2573 else
2574 del_item = 0;
2575 found_extent = 0;
2576
2577 /* FIXME, shrink the extent if the ref count is only 1 */
2578 if (found_type != BTRFS_EXTENT_DATA_KEY)
2579 goto delete;
2580
2581 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2582 u64 num_dec;
2583 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2584 if (!del_item && !encoding) {
2585 u64 orig_num_bytes =
2586 btrfs_file_extent_num_bytes(leaf, fi);
2587 extent_num_bytes = new_size -
2588 found_key.offset + root->sectorsize - 1;
2589 extent_num_bytes = extent_num_bytes &
2590 ~((u64)root->sectorsize - 1);
2591 btrfs_set_file_extent_num_bytes(leaf, fi,
2592 extent_num_bytes);
2593 num_dec = (orig_num_bytes -
2594 extent_num_bytes);
2595 if (root->ref_cows && extent_start != 0)
2596 inode_sub_bytes(inode, num_dec);
2597 btrfs_mark_buffer_dirty(leaf);
2598 } else {
2599 extent_num_bytes =
2600 btrfs_file_extent_disk_num_bytes(leaf,
2601 fi);
2602 /* FIXME blocksize != 4096 */
2603 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2604 if (extent_start != 0) {
2605 found_extent = 1;
2606 if (root->ref_cows)
2607 inode_sub_bytes(inode, num_dec);
2608 }
2609 root_gen = btrfs_header_generation(leaf);
2610 root_owner = btrfs_header_owner(leaf);
2611 }
2612 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
2613 /*
2614 * we can't truncate inline items that have had
2615 * special encodings
2616 */
2617 if (!del_item &&
2618 btrfs_file_extent_compression(leaf, fi) == 0 &&
2619 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2620 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2621 u32 size = new_size - found_key.offset;
2622
2623 if (root->ref_cows) {
2624 inode_sub_bytes(inode, item_end + 1 -
2625 new_size);
2626 }
2627 size =
2628 btrfs_file_extent_calc_inline_size(size);
2629 ret = btrfs_truncate_item(trans, root, path,
2630 size, 1);
2631 BUG_ON(ret);
2632 } else if (root->ref_cows) {
2633 inode_sub_bytes(inode, item_end + 1 -
2634 found_key.offset);
2635 }
2636 }
2637delete:
2638 if (del_item) {
2639 if (!pending_del_nr) {
2640 /* no pending yet, add ourselves */
2641 pending_del_slot = path->slots[0];
2642 pending_del_nr = 1;
2643 } else if (pending_del_nr &&
2644 path->slots[0] + 1 == pending_del_slot) {
2645 /* hop on the pending chunk */
2646 pending_del_nr++;
2647 pending_del_slot = path->slots[0];
2648 } else {
2649 BUG();
2650 }
2651 } else {
2652 break;
2653 }
2654 if (found_extent) {
2655 ret = btrfs_free_extent(trans, root, extent_start,
2656 extent_num_bytes,
2657 leaf->start, root_owner,
2658 root_gen, inode->i_ino, 0);
2659 BUG_ON(ret);
2660 }
2661next:
2662 if (path->slots[0] == 0) {
2663 if (pending_del_nr)
2664 goto del_pending;
2665 btrfs_release_path(root, path);
2666 goto search_again;
2667 }
2668
2669 path->slots[0]--;
2670 if (pending_del_nr &&
2671 path->slots[0] + 1 != pending_del_slot) {
2672 struct btrfs_key debug;
2673del_pending:
2674 btrfs_item_key_to_cpu(path->nodes[0], &debug,
2675 pending_del_slot);
2676 ret = btrfs_del_items(trans, root, path,
2677 pending_del_slot,
2678 pending_del_nr);
2679 BUG_ON(ret);
2680 pending_del_nr = 0;
2681 btrfs_release_path(root, path);
2682 goto search_again;
2683 }
2684 }
2685 ret = 0;
2686error:
2687 if (pending_del_nr) {
2688 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2689 pending_del_nr);
2690 }
2691 btrfs_free_path(path);
2692 inode->i_sb->s_dirt = 1;
2693 return ret;
2694}
2695
2696/*
2697 * taken from block_truncate_page, but does cow as it zeros out
2698 * any bytes left in the last page in the file.
2699 */
2700static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2701{
2702 struct inode *inode = mapping->host;
2703 struct btrfs_root *root = BTRFS_I(inode)->root;
2704 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2705 struct btrfs_ordered_extent *ordered;
2706 char *kaddr;
2707 u32 blocksize = root->sectorsize;
2708 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2709 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2710 struct page *page;
2711 int ret = 0;
2712 u64 page_start;
2713 u64 page_end;
2714
2715 if ((offset & (blocksize - 1)) == 0)
2716 goto out;
2717
2718 ret = -ENOMEM;
2719again:
2720 page = grab_cache_page(mapping, index);
2721 if (!page)
2722 goto out;
2723
2724 page_start = page_offset(page);
2725 page_end = page_start + PAGE_CACHE_SIZE - 1;
2726
2727 if (!PageUptodate(page)) {
2728 ret = btrfs_readpage(NULL, page);
2729 lock_page(page);
2730 if (page->mapping != mapping) {
2731 unlock_page(page);
2732 page_cache_release(page);
2733 goto again;
2734 }
2735 if (!PageUptodate(page)) {
2736 ret = -EIO;
2737 goto out_unlock;
2738 }
2739 }
2740 wait_on_page_writeback(page);
2741
2742 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2743 set_page_extent_mapped(page);
2744
2745 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2746 if (ordered) {
2747 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2748 unlock_page(page);
2749 page_cache_release(page);
2750 btrfs_start_ordered_extent(inode, ordered, 1);
2751 btrfs_put_ordered_extent(ordered);
2752 goto again;
2753 }
2754
2755 btrfs_set_extent_delalloc(inode, page_start, page_end);
2756 ret = 0;
2757 if (offset != PAGE_CACHE_SIZE) {
2758 kaddr = kmap(page);
2759 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2760 flush_dcache_page(page);
2761 kunmap(page);
2762 }
2763 ClearPageChecked(page);
2764 set_page_dirty(page);
2765 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2766
2767out_unlock:
2768 unlock_page(page);
2769 page_cache_release(page);
2770out:
2771 return ret;
2772}
2773
2774int btrfs_cont_expand(struct inode *inode, loff_t size)
2775{
2776 struct btrfs_trans_handle *trans;
2777 struct btrfs_root *root = BTRFS_I(inode)->root;
2778 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2779 struct extent_map *em;
2780 u64 mask = root->sectorsize - 1;
2781 u64 hole_start = (inode->i_size + mask) & ~mask;
2782 u64 block_end = (size + mask) & ~mask;
2783 u64 last_byte;
2784 u64 cur_offset;
2785 u64 hole_size;
2786 int err;
2787
2788 if (size <= hole_start)
2789 return 0;
2790
2791 err = btrfs_check_free_space(root, 1, 0);
2792 if (err)
2793 return err;
2794
2795 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2796
2797 while (1) {
2798 struct btrfs_ordered_extent *ordered;
2799 btrfs_wait_ordered_range(inode, hole_start,
2800 block_end - hole_start);
2801 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2802 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2803 if (!ordered)
2804 break;
2805 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2806 btrfs_put_ordered_extent(ordered);
2807 }
2808
2809 trans = btrfs_start_transaction(root, 1);
2810 btrfs_set_trans_block_group(trans, inode);
2811
2812 cur_offset = hole_start;
2813 while (1) {
2814 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2815 block_end - cur_offset, 0);
2816 BUG_ON(IS_ERR(em) || !em);
2817 last_byte = min(extent_map_end(em), block_end);
2818 last_byte = (last_byte + mask) & ~mask;
2819 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2820 u64 hint_byte = 0;
2821 hole_size = last_byte - cur_offset;
2822 err = btrfs_drop_extents(trans, root, inode,
2823 cur_offset,
2824 cur_offset + hole_size,
2825 cur_offset, &hint_byte);
2826 if (err)
2827 break;
2828 err = btrfs_insert_file_extent(trans, root,
2829 inode->i_ino, cur_offset, 0,
2830 0, hole_size, 0, hole_size,
2831 0, 0, 0);
2832 btrfs_drop_extent_cache(inode, hole_start,
2833 last_byte - 1, 0);
2834 }
2835 free_extent_map(em);
2836 cur_offset = last_byte;
2837 if (err || cur_offset >= block_end)
2838 break;
2839 }
2840
2841 btrfs_end_transaction(trans, root);
2842 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2843 return err;
2844}
2845
2846static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2847{
2848 struct inode *inode = dentry->d_inode;
2849 int err;
2850
2851 err = inode_change_ok(inode, attr);
2852 if (err)
2853 return err;
2854
2855 if (S_ISREG(inode->i_mode) &&
2856 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2857 err = btrfs_cont_expand(inode, attr->ia_size);
2858 if (err)
2859 return err;
2860 }
2861
2862 err = inode_setattr(inode, attr);
2863
2864 if (!err && ((attr->ia_valid & ATTR_MODE)))
2865 err = btrfs_acl_chmod(inode);
2866 return err;
2867}
2868
2869void btrfs_delete_inode(struct inode *inode)
2870{
2871 struct btrfs_trans_handle *trans;
2872 struct btrfs_root *root = BTRFS_I(inode)->root;
2873 unsigned long nr;
2874 int ret;
2875
2876 truncate_inode_pages(&inode->i_data, 0);
2877 if (is_bad_inode(inode)) {
2878 btrfs_orphan_del(NULL, inode);
2879 goto no_delete;
2880 }
2881 btrfs_wait_ordered_range(inode, 0, (u64)-1);
2882
2883 btrfs_i_size_write(inode, 0);
2884 trans = btrfs_join_transaction(root, 1);
2885
2886 btrfs_set_trans_block_group(trans, inode);
2887 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2888 if (ret) {
2889 btrfs_orphan_del(NULL, inode);
2890 goto no_delete_lock;
2891 }
2892
2893 btrfs_orphan_del(trans, inode);
2894
2895 nr = trans->blocks_used;
2896 clear_inode(inode);
2897
2898 btrfs_end_transaction(trans, root);
2899 btrfs_btree_balance_dirty(root, nr);
2900 return;
2901
2902no_delete_lock:
2903 nr = trans->blocks_used;
2904 btrfs_end_transaction(trans, root);
2905 btrfs_btree_balance_dirty(root, nr);
2906no_delete:
2907 clear_inode(inode);
2908}
2909
2910/*
2911 * this returns the key found in the dir entry in the location pointer.
2912 * If no dir entries were found, location->objectid is 0.
2913 */
2914static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2915 struct btrfs_key *location)
2916{
2917 const char *name = dentry->d_name.name;
2918 int namelen = dentry->d_name.len;
2919 struct btrfs_dir_item *di;
2920 struct btrfs_path *path;
2921 struct btrfs_root *root = BTRFS_I(dir)->root;
2922 int ret = 0;
2923
2924 path = btrfs_alloc_path();
2925 BUG_ON(!path);
2926
2927 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2928 namelen, 0);
2929 if (IS_ERR(di))
2930 ret = PTR_ERR(di);
2931
2932 if (!di || IS_ERR(di))
2933 goto out_err;
2934
2935 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2936out:
2937 btrfs_free_path(path);
2938 return ret;
2939out_err:
2940 location->objectid = 0;
2941 goto out;
2942}
2943
2944/*
2945 * when we hit a tree root in a directory, the btrfs part of the inode
2946 * needs to be changed to reflect the root directory of the tree root. This
2947 * is kind of like crossing a mount point.
2948 */
2949static int fixup_tree_root_location(struct btrfs_root *root,
2950 struct btrfs_key *location,
2951 struct btrfs_root **sub_root,
2952 struct dentry *dentry)
2953{
2954 struct btrfs_root_item *ri;
2955
2956 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2957 return 0;
2958 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2959 return 0;
2960
2961 *sub_root = btrfs_read_fs_root(root->fs_info, location,
2962 dentry->d_name.name,
2963 dentry->d_name.len);
2964 if (IS_ERR(*sub_root))
2965 return PTR_ERR(*sub_root);
2966
2967 ri = &(*sub_root)->root_item;
2968 location->objectid = btrfs_root_dirid(ri);
2969 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2970 location->offset = 0;
2971
2972 return 0;
2973}
2974
2975static noinline void init_btrfs_i(struct inode *inode)
2976{
2977 struct btrfs_inode *bi = BTRFS_I(inode);
2978
2979 bi->i_acl = NULL;
2980 bi->i_default_acl = NULL;
2981
2982 bi->generation = 0;
2983 bi->sequence = 0;
2984 bi->last_trans = 0;
2985 bi->logged_trans = 0;
2986 bi->delalloc_bytes = 0;
2987 bi->disk_i_size = 0;
2988 bi->flags = 0;
2989 bi->index_cnt = (u64)-1;
2990 bi->log_dirty_trans = 0;
2991 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2992 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2993 inode->i_mapping, GFP_NOFS);
2994 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2995 inode->i_mapping, GFP_NOFS);
2996 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
2997 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2998 mutex_init(&BTRFS_I(inode)->extent_mutex);
2999 mutex_init(&BTRFS_I(inode)->log_mutex);
3000}
3001
3002static int btrfs_init_locked_inode(struct inode *inode, void *p)
3003{
3004 struct btrfs_iget_args *args = p;
3005 inode->i_ino = args->ino;
3006 init_btrfs_i(inode);
3007 BTRFS_I(inode)->root = args->root;
3008 return 0;
3009}
3010
3011static int btrfs_find_actor(struct inode *inode, void *opaque)
3012{
3013 struct btrfs_iget_args *args = opaque;
3014 return args->ino == inode->i_ino &&
3015 args->root == BTRFS_I(inode)->root;
3016}
3017
3018struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
3019 struct btrfs_root *root, int wait)
3020{
3021 struct inode *inode;
3022 struct btrfs_iget_args args;
3023 args.ino = objectid;
3024 args.root = root;
3025
3026 if (wait) {
3027 inode = ilookup5(s, objectid, btrfs_find_actor,
3028 (void *)&args);
3029 } else {
3030 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3031 (void *)&args);
3032 }
3033 return inode;
3034}
3035
3036struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3037 struct btrfs_root *root)
3038{
3039 struct inode *inode;
3040 struct btrfs_iget_args args;
3041 args.ino = objectid;
3042 args.root = root;
3043
3044 inode = iget5_locked(s, objectid, btrfs_find_actor,
3045 btrfs_init_locked_inode,
3046 (void *)&args);
3047 return inode;
3048}
3049
3050/* Get an inode object given its location and corresponding root.
3051 * Returns in *is_new if the inode was read from disk
3052 */
3053struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3054 struct btrfs_root *root, int *is_new)
3055{
3056 struct inode *inode;
3057
3058 inode = btrfs_iget_locked(s, location->objectid, root);
3059 if (!inode)
3060 return ERR_PTR(-EACCES);
3061
3062 if (inode->i_state & I_NEW) {
3063 BTRFS_I(inode)->root = root;
3064 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3065 btrfs_read_locked_inode(inode);
3066 unlock_new_inode(inode);
3067 if (is_new)
3068 *is_new = 1;
3069 } else {
3070 if (is_new)
3071 *is_new = 0;
3072 }
3073
3074 return inode;
3075}
3076
3077struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3078{
3079 struct inode *inode;
3080 struct btrfs_inode *bi = BTRFS_I(dir);
3081 struct btrfs_root *root = bi->root;
3082 struct btrfs_root *sub_root = root;
3083 struct btrfs_key location;
3084 int ret, new;
3085
3086 if (dentry->d_name.len > BTRFS_NAME_LEN)
3087 return ERR_PTR(-ENAMETOOLONG);
3088
3089 ret = btrfs_inode_by_name(dir, dentry, &location);
3090
3091 if (ret < 0)
3092 return ERR_PTR(ret);
3093
3094 inode = NULL;
3095 if (location.objectid) {
3096 ret = fixup_tree_root_location(root, &location, &sub_root,
3097 dentry);
3098 if (ret < 0)
3099 return ERR_PTR(ret);
3100 if (ret > 0)
3101 return ERR_PTR(-ENOENT);
3102 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3103 if (IS_ERR(inode))
3104 return ERR_CAST(inode);
3105 }
3106 return inode;
3107}
3108
3109static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3110 struct nameidata *nd)
3111{
3112 struct inode *inode;
3113
3114 if (dentry->d_name.len > BTRFS_NAME_LEN)
3115 return ERR_PTR(-ENAMETOOLONG);
3116
3117 inode = btrfs_lookup_dentry(dir, dentry);
3118 if (IS_ERR(inode))
3119 return ERR_CAST(inode);
3120
3121 return d_splice_alias(inode, dentry);
3122}
3123
3124static unsigned char btrfs_filetype_table[] = {
3125 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3126};
3127
3128static int btrfs_real_readdir(struct file *filp, void *dirent,
3129 filldir_t filldir)
3130{
3131 struct inode *inode = filp->f_dentry->d_inode;
3132 struct btrfs_root *root = BTRFS_I(inode)->root;
3133 struct btrfs_item *item;
3134 struct btrfs_dir_item *di;
3135 struct btrfs_key key;
3136 struct btrfs_key found_key;
3137 struct btrfs_path *path;
3138 int ret;
3139 u32 nritems;
3140 struct extent_buffer *leaf;
3141 int slot;
3142 int advance;
3143 unsigned char d_type;
3144 int over = 0;
3145 u32 di_cur;
3146 u32 di_total;
3147 u32 di_len;
3148 int key_type = BTRFS_DIR_INDEX_KEY;
3149 char tmp_name[32];
3150 char *name_ptr;
3151 int name_len;
3152
3153 /* FIXME, use a real flag for deciding about the key type */
3154 if (root->fs_info->tree_root == root)
3155 key_type = BTRFS_DIR_ITEM_KEY;
3156
3157 /* special case for "." */
3158 if (filp->f_pos == 0) {
3159 over = filldir(dirent, ".", 1,
3160 1, inode->i_ino,
3161 DT_DIR);
3162 if (over)
3163 return 0;
3164 filp->f_pos = 1;
3165 }
3166 /* special case for .., just use the back ref */
3167 if (filp->f_pos == 1) {
3168 u64 pino = parent_ino(filp->f_path.dentry);
3169 over = filldir(dirent, "..", 2,
3170 2, pino, DT_DIR);
3171 if (over)
3172 return 0;
3173 filp->f_pos = 2;
3174 }
3175 path = btrfs_alloc_path();
3176 path->reada = 2;
3177
3178 btrfs_set_key_type(&key, key_type);
3179 key.offset = filp->f_pos;
3180 key.objectid = inode->i_ino;
3181
3182 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3183 if (ret < 0)
3184 goto err;
3185 advance = 0;
3186
3187 while (1) {
3188 leaf = path->nodes[0];
3189 nritems = btrfs_header_nritems(leaf);
3190 slot = path->slots[0];
3191 if (advance || slot >= nritems) {
3192 if (slot >= nritems - 1) {
3193 ret = btrfs_next_leaf(root, path);
3194 if (ret)
3195 break;
3196 leaf = path->nodes[0];
3197 nritems = btrfs_header_nritems(leaf);
3198 slot = path->slots[0];
3199 } else {
3200 slot++;
3201 path->slots[0]++;
3202 }
3203 }
3204
3205 advance = 1;
3206 item = btrfs_item_nr(leaf, slot);
3207 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3208
3209 if (found_key.objectid != key.objectid)
3210 break;
3211 if (btrfs_key_type(&found_key) != key_type)
3212 break;
3213 if (found_key.offset < filp->f_pos)
3214 continue;
3215
3216 filp->f_pos = found_key.offset;
3217
3218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3219 di_cur = 0;
3220 di_total = btrfs_item_size(leaf, item);
3221
3222 while (di_cur < di_total) {
3223 struct btrfs_key location;
3224
3225 name_len = btrfs_dir_name_len(leaf, di);
3226 if (name_len <= sizeof(tmp_name)) {
3227 name_ptr = tmp_name;
3228 } else {
3229 name_ptr = kmalloc(name_len, GFP_NOFS);
3230 if (!name_ptr) {
3231 ret = -ENOMEM;
3232 goto err;
3233 }
3234 }
3235 read_extent_buffer(leaf, name_ptr,
3236 (unsigned long)(di + 1), name_len);
3237
3238 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3239 btrfs_dir_item_key_to_cpu(leaf, di, &location);
3240
3241 /* is this a reference to our own snapshot? If so
3242 * skip it
3243 */
3244 if (location.type == BTRFS_ROOT_ITEM_KEY &&
3245 location.objectid == root->root_key.objectid) {
3246 over = 0;
3247 goto skip;
3248 }
3249 over = filldir(dirent, name_ptr, name_len,
3250 found_key.offset, location.objectid,
3251 d_type);
3252
3253skip:
3254 if (name_ptr != tmp_name)
3255 kfree(name_ptr);
3256
3257 if (over)
3258 goto nopos;
3259 di_len = btrfs_dir_name_len(leaf, di) +
3260 btrfs_dir_data_len(leaf, di) + sizeof(*di);
3261 di_cur += di_len;
3262 di = (struct btrfs_dir_item *)((char *)di + di_len);
3263 }
3264 }
3265
3266 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
3269 else
3270 filp->f_pos++;
3271nopos:
3272 ret = 0;
3273err:
3274 btrfs_free_path(path);
3275 return ret;
3276}
3277
3278int btrfs_write_inode(struct inode *inode, int wait)
3279{
3280 struct btrfs_root *root = BTRFS_I(inode)->root;
3281 struct btrfs_trans_handle *trans;
3282 int ret = 0;
3283
3284 if (root->fs_info->btree_inode == inode)
3285 return 0;
3286
3287 if (wait) {
3288 trans = btrfs_join_transaction(root, 1);
3289 btrfs_set_trans_block_group(trans, inode);
3290 ret = btrfs_commit_transaction(trans, root);
3291 }
3292 return ret;
3293}
3294
3295/*
3296 * This is somewhat expensive, updating the tree every time the
3297 * inode changes. But, it is most likely to find the inode in cache.
3298 * FIXME, needs more benchmarking...there are no reasons other than performance
3299 * to keep or drop this code.
3300 */
3301void btrfs_dirty_inode(struct inode *inode)
3302{
3303 struct btrfs_root *root = BTRFS_I(inode)->root;
3304 struct btrfs_trans_handle *trans;
3305
3306 trans = btrfs_join_transaction(root, 1);
3307 btrfs_set_trans_block_group(trans, inode);
3308 btrfs_update_inode(trans, root, inode);
3309 btrfs_end_transaction(trans, root);
3310}
3311
3312/*
3313 * find the highest existing sequence number in a directory
3314 * and then set the in-memory index_cnt variable to reflect
3315 * free sequence numbers
3316 */
3317static int btrfs_set_inode_index_count(struct inode *inode)
3318{
3319 struct btrfs_root *root = BTRFS_I(inode)->root;
3320 struct btrfs_key key, found_key;
3321 struct btrfs_path *path;
3322 struct extent_buffer *leaf;
3323 int ret;
3324
3325 key.objectid = inode->i_ino;
3326 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3327 key.offset = (u64)-1;
3328
3329 path = btrfs_alloc_path();
3330 if (!path)
3331 return -ENOMEM;
3332
3333 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3334 if (ret < 0)
3335 goto out;
3336 /* FIXME: we should be able to handle this */
3337 if (ret == 0)
3338 goto out;
3339 ret = 0;
3340
3341 /*
3342 * MAGIC NUMBER EXPLANATION:
3343 * since we search a directory based on f_pos we have to start at 2
3344 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3345 * else has to start at 2
3346 */
3347 if (path->slots[0] == 0) {
3348 BTRFS_I(inode)->index_cnt = 2;
3349 goto out;
3350 }
3351
3352 path->slots[0]--;
3353
3354 leaf = path->nodes[0];
3355 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3356
3357 if (found_key.objectid != inode->i_ino ||
3358 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3359 BTRFS_I(inode)->index_cnt = 2;
3360 goto out;
3361 }
3362
3363 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3364out:
3365 btrfs_free_path(path);
3366 return ret;
3367}
3368
3369/*
3370 * helper to find a free sequence number in a given directory. This current
3371 * code is very simple, later versions will do smarter things in the btree
3372 */
3373int btrfs_set_inode_index(struct inode *dir, u64 *index)
3374{
3375 int ret = 0;
3376
3377 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3378 ret = btrfs_set_inode_index_count(dir);
3379 if (ret)
3380 return ret;
3381 }
3382
3383 *index = BTRFS_I(dir)->index_cnt;
3384 BTRFS_I(dir)->index_cnt++;
3385
3386 return ret;
3387}
3388
3389static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3390 struct btrfs_root *root,
3391 struct inode *dir,
3392 const char *name, int name_len,
3393 u64 ref_objectid, u64 objectid,
3394 u64 alloc_hint, int mode, u64 *index)
3395{
3396 struct inode *inode;
3397 struct btrfs_inode_item *inode_item;
3398 struct btrfs_key *location;
3399 struct btrfs_path *path;
3400 struct btrfs_inode_ref *ref;
3401 struct btrfs_key key[2];
3402 u32 sizes[2];
3403 unsigned long ptr;
3404 int ret;
3405 int owner;
3406
3407 path = btrfs_alloc_path();
3408 BUG_ON(!path);
3409
3410 inode = new_inode(root->fs_info->sb);
3411 if (!inode)
3412 return ERR_PTR(-ENOMEM);
3413
3414 if (dir) {
3415 ret = btrfs_set_inode_index(dir, index);
3416 if (ret)
3417 return ERR_PTR(ret);
3418 }
3419 /*
3420 * index_cnt is ignored for everything but a dir,
3421 * btrfs_get_inode_index_count has an explanation for the magic
3422 * number
3423 */
3424 init_btrfs_i(inode);
3425 BTRFS_I(inode)->index_cnt = 2;
3426 BTRFS_I(inode)->root = root;
3427 BTRFS_I(inode)->generation = trans->transid;
3428
3429 if (mode & S_IFDIR)
3430 owner = 0;
3431 else
3432 owner = 1;
3433 BTRFS_I(inode)->block_group =
3434 btrfs_find_block_group(root, 0, alloc_hint, owner);
3435 if ((mode & S_IFREG)) {
3436 if (btrfs_test_opt(root, NODATASUM))
3437 btrfs_set_flag(inode, NODATASUM);
3438 if (btrfs_test_opt(root, NODATACOW))
3439 btrfs_set_flag(inode, NODATACOW);
3440 }
3441
3442 key[0].objectid = objectid;
3443 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3444 key[0].offset = 0;
3445
3446 key[1].objectid = objectid;
3447 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3448 key[1].offset = ref_objectid;
3449
3450 sizes[0] = sizeof(struct btrfs_inode_item);
3451 sizes[1] = name_len + sizeof(*ref);
3452
3453 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3454 if (ret != 0)
3455 goto fail;
3456
3457 if (objectid > root->highest_inode)
3458 root->highest_inode = objectid;
3459
3460 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid();
3462 inode->i_mode = mode;
3463 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0);
3465 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3466 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3467 struct btrfs_inode_item);
3468 fill_inode_item(trans, path->nodes[0], inode_item, inode);
3469
3470 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3471 struct btrfs_inode_ref);
3472 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3473 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3474 ptr = (unsigned long)(ref + 1);
3475 write_extent_buffer(path->nodes[0], name, ptr, name_len);
3476
3477 btrfs_mark_buffer_dirty(path->nodes[0]);
3478 btrfs_free_path(path);
3479
3480 location = &BTRFS_I(inode)->location;
3481 location->objectid = objectid;
3482 location->offset = 0;
3483 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3484
3485 insert_inode_hash(inode);
3486 return inode;
3487fail:
3488 if (dir)
3489 BTRFS_I(dir)->index_cnt--;
3490 btrfs_free_path(path);
3491 return ERR_PTR(ret);
3492}
3493
3494static inline u8 btrfs_inode_type(struct inode *inode)
3495{
3496 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3497}
3498
3499/*
3500 * utility function to add 'inode' into 'parent_inode' with
3501 * a give name and a given sequence number.
3502 * if 'add_backref' is true, also insert a backref from the
3503 * inode to the parent directory.
3504 */
3505int btrfs_add_link(struct btrfs_trans_handle *trans,
3506 struct inode *parent_inode, struct inode *inode,
3507 const char *name, int name_len, int add_backref, u64 index)
3508{
3509 int ret;
3510 struct btrfs_key key;
3511 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3512
3513 key.objectid = inode->i_ino;
3514 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3515 key.offset = 0;
3516
3517 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3518 parent_inode->i_ino,
3519 &key, btrfs_inode_type(inode),
3520 index);
3521 if (ret == 0) {
3522 if (add_backref) {
3523 ret = btrfs_insert_inode_ref(trans, root,
3524 name, name_len,
3525 inode->i_ino,
3526 parent_inode->i_ino,
3527 index);
3528 }
3529 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3530 name_len * 2);
3531 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3532 ret = btrfs_update_inode(trans, root, parent_inode);
3533 }
3534 return ret;
3535}
3536
3537static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3538 struct dentry *dentry, struct inode *inode,
3539 int backref, u64 index)
3540{
3541 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3542 inode, dentry->d_name.name,
3543 dentry->d_name.len, backref, index);
3544 if (!err) {
3545 d_instantiate(dentry, inode);
3546 return 0;
3547 }
3548 if (err > 0)
3549 err = -EEXIST;
3550 return err;
3551}
3552
3553static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3554 int mode, dev_t rdev)
3555{
3556 struct btrfs_trans_handle *trans;
3557 struct btrfs_root *root = BTRFS_I(dir)->root;
3558 struct inode *inode = NULL;
3559 int err;
3560 int drop_inode = 0;
3561 u64 objectid;
3562 unsigned long nr = 0;
3563 u64 index = 0;
3564
3565 if (!new_valid_dev(rdev))
3566 return -EINVAL;
3567
3568 err = btrfs_check_free_space(root, 1, 0);
3569 if (err)
3570 goto fail;
3571
3572 trans = btrfs_start_transaction(root, 1);
3573 btrfs_set_trans_block_group(trans, dir);
3574
3575 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3576 if (err) {
3577 err = -ENOSPC;
3578 goto out_unlock;
3579 }
3580
3581 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3582 dentry->d_name.len,
3583 dentry->d_parent->d_inode->i_ino, objectid,
3584 BTRFS_I(dir)->block_group, mode, &index);
3585 err = PTR_ERR(inode);
3586 if (IS_ERR(inode))
3587 goto out_unlock;
3588
3589 err = btrfs_init_acl(inode, dir);
3590 if (err) {
3591 drop_inode = 1;
3592 goto out_unlock;
3593 }
3594
3595 btrfs_set_trans_block_group(trans, inode);
3596 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3597 if (err)
3598 drop_inode = 1;
3599 else {
3600 inode->i_op = &btrfs_special_inode_operations;
3601 init_special_inode(inode, inode->i_mode, rdev);
3602 btrfs_update_inode(trans, root, inode);
3603 }
3604 dir->i_sb->s_dirt = 1;
3605 btrfs_update_inode_block_group(trans, inode);
3606 btrfs_update_inode_block_group(trans, dir);
3607out_unlock:
3608 nr = trans->blocks_used;
3609 btrfs_end_transaction_throttle(trans, root);
3610fail:
3611 if (drop_inode) {
3612 inode_dec_link_count(inode);
3613 iput(inode);
3614 }
3615 btrfs_btree_balance_dirty(root, nr);
3616 return err;
3617}
3618
3619static int btrfs_create(struct inode *dir, struct dentry *dentry,
3620 int mode, struct nameidata *nd)
3621{
3622 struct btrfs_trans_handle *trans;
3623 struct btrfs_root *root = BTRFS_I(dir)->root;
3624 struct inode *inode = NULL;
3625 int err;
3626 int drop_inode = 0;
3627 unsigned long nr = 0;
3628 u64 objectid;
3629 u64 index = 0;
3630
3631 err = btrfs_check_free_space(root, 1, 0);
3632 if (err)
3633 goto fail;
3634 trans = btrfs_start_transaction(root, 1);
3635 btrfs_set_trans_block_group(trans, dir);
3636
3637 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3638 if (err) {
3639 err = -ENOSPC;
3640 goto out_unlock;
3641 }
3642
3643 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3644 dentry->d_name.len,
3645 dentry->d_parent->d_inode->i_ino,
3646 objectid, BTRFS_I(dir)->block_group, mode,
3647 &index);
3648 err = PTR_ERR(inode);
3649 if (IS_ERR(inode))
3650 goto out_unlock;
3651
3652 err = btrfs_init_acl(inode, dir);
3653 if (err) {
3654 drop_inode = 1;
3655 goto out_unlock;
3656 }
3657
3658 btrfs_set_trans_block_group(trans, inode);
3659 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3660 if (err)
3661 drop_inode = 1;
3662 else {
3663 inode->i_mapping->a_ops = &btrfs_aops;
3664 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3665 inode->i_fop = &btrfs_file_operations;
3666 inode->i_op = &btrfs_file_inode_operations;
3667 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3668 }
3669 dir->i_sb->s_dirt = 1;
3670 btrfs_update_inode_block_group(trans, inode);
3671 btrfs_update_inode_block_group(trans, dir);
3672out_unlock:
3673 nr = trans->blocks_used;
3674 btrfs_end_transaction_throttle(trans, root);
3675fail:
3676 if (drop_inode) {
3677 inode_dec_link_count(inode);
3678 iput(inode);
3679 }
3680 btrfs_btree_balance_dirty(root, nr);
3681 return err;
3682}
3683
3684static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3685 struct dentry *dentry)
3686{
3687 struct btrfs_trans_handle *trans;
3688 struct btrfs_root *root = BTRFS_I(dir)->root;
3689 struct inode *inode = old_dentry->d_inode;
3690 u64 index;
3691 unsigned long nr = 0;
3692 int err;
3693 int drop_inode = 0;
3694
3695 if (inode->i_nlink == 0)
3696 return -ENOENT;
3697
3698 btrfs_inc_nlink(inode);
3699 err = btrfs_check_free_space(root, 1, 0);
3700 if (err)
3701 goto fail;
3702 err = btrfs_set_inode_index(dir, &index);
3703 if (err)
3704 goto fail;
3705
3706 trans = btrfs_start_transaction(root, 1);
3707
3708 btrfs_set_trans_block_group(trans, dir);
3709 atomic_inc(&inode->i_count);
3710
3711 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3712
3713 if (err)
3714 drop_inode = 1;
3715
3716 dir->i_sb->s_dirt = 1;
3717 btrfs_update_inode_block_group(trans, dir);
3718 err = btrfs_update_inode(trans, root, inode);
3719
3720 if (err)
3721 drop_inode = 1;
3722
3723 nr = trans->blocks_used;
3724 btrfs_end_transaction_throttle(trans, root);
3725fail:
3726 if (drop_inode) {
3727 inode_dec_link_count(inode);
3728 iput(inode);
3729 }
3730 btrfs_btree_balance_dirty(root, nr);
3731 return err;
3732}
3733
3734static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3735{
3736 struct inode *inode = NULL;
3737 struct btrfs_trans_handle *trans;
3738 struct btrfs_root *root = BTRFS_I(dir)->root;
3739 int err = 0;
3740 int drop_on_err = 0;
3741 u64 objectid = 0;
3742 u64 index = 0;
3743 unsigned long nr = 1;
3744
3745 err = btrfs_check_free_space(root, 1, 0);
3746 if (err)
3747 goto out_unlock;
3748
3749 trans = btrfs_start_transaction(root, 1);
3750 btrfs_set_trans_block_group(trans, dir);
3751
3752 if (IS_ERR(trans)) {
3753 err = PTR_ERR(trans);
3754 goto out_unlock;
3755 }
3756
3757 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3758 if (err) {
3759 err = -ENOSPC;
3760 goto out_unlock;
3761 }
3762
3763 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3764 dentry->d_name.len,
3765 dentry->d_parent->d_inode->i_ino, objectid,
3766 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3767 &index);
3768 if (IS_ERR(inode)) {
3769 err = PTR_ERR(inode);
3770 goto out_fail;
3771 }
3772
3773 drop_on_err = 1;
3774
3775 err = btrfs_init_acl(inode, dir);
3776 if (err)
3777 goto out_fail;
3778
3779 inode->i_op = &btrfs_dir_inode_operations;
3780 inode->i_fop = &btrfs_dir_file_operations;
3781 btrfs_set_trans_block_group(trans, inode);
3782
3783 btrfs_i_size_write(inode, 0);
3784 err = btrfs_update_inode(trans, root, inode);
3785 if (err)
3786 goto out_fail;
3787
3788 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3789 inode, dentry->d_name.name,
3790 dentry->d_name.len, 0, index);
3791 if (err)
3792 goto out_fail;
3793
3794 d_instantiate(dentry, inode);
3795 drop_on_err = 0;
3796 dir->i_sb->s_dirt = 1;
3797 btrfs_update_inode_block_group(trans, inode);
3798 btrfs_update_inode_block_group(trans, dir);
3799
3800out_fail:
3801 nr = trans->blocks_used;
3802 btrfs_end_transaction_throttle(trans, root);
3803
3804out_unlock:
3805 if (drop_on_err)
3806 iput(inode);
3807 btrfs_btree_balance_dirty(root, nr);
3808 return err;
3809}
3810
3811/* helper for btfs_get_extent. Given an existing extent in the tree,
3812 * and an extent that you want to insert, deal with overlap and insert
3813 * the new extent into the tree.
3814 */
3815static int merge_extent_mapping(struct extent_map_tree *em_tree,
3816 struct extent_map *existing,
3817 struct extent_map *em,
3818 u64 map_start, u64 map_len)
3819{
3820 u64 start_diff;
3821
3822 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3823 start_diff = map_start - em->start;
3824 em->start = map_start;
3825 em->len = map_len;
3826 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3827 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3828 em->block_start += start_diff;
3829 em->block_len -= start_diff;
3830 }
3831 return add_extent_mapping(em_tree, em);
3832}
3833
3834static noinline int uncompress_inline(struct btrfs_path *path,
3835 struct inode *inode, struct page *page,
3836 size_t pg_offset, u64 extent_offset,
3837 struct btrfs_file_extent_item *item)
3838{
3839 int ret;
3840 struct extent_buffer *leaf = path->nodes[0];
3841 char *tmp;
3842 size_t max_size;
3843 unsigned long inline_size;
3844 unsigned long ptr;
3845
3846 WARN_ON(pg_offset != 0);
3847 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3848 inline_size = btrfs_file_extent_inline_item_len(leaf,
3849 btrfs_item_nr(leaf, path->slots[0]));
3850 tmp = kmalloc(inline_size, GFP_NOFS);
3851 ptr = btrfs_file_extent_inline_start(item);
3852
3853 read_extent_buffer(leaf, tmp, ptr, inline_size);
3854
3855 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3856 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3857 inline_size, max_size);
3858 if (ret) {
3859 char *kaddr = kmap_atomic(page, KM_USER0);
3860 unsigned long copy_size = min_t(u64,
3861 PAGE_CACHE_SIZE - pg_offset,
3862 max_size - extent_offset);
3863 memset(kaddr + pg_offset, 0, copy_size);
3864 kunmap_atomic(kaddr, KM_USER0);
3865 }
3866 kfree(tmp);
3867 return 0;
3868}
3869
3870/*
3871 * a bit scary, this does extent mapping from logical file offset to the disk.
3872 * the ugly parts come from merging extents from the disk with the in-ram
3873 * representation. This gets more complex because of the data=ordered code,
3874 * where the in-ram extents might be locked pending data=ordered completion.
3875 *
3876 * This also copies inline extents directly into the page.
3877 */
3878
3879struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3880 size_t pg_offset, u64 start, u64 len,
3881 int create)
3882{
3883 int ret;
3884 int err = 0;
3885 u64 bytenr;
3886 u64 extent_start = 0;
3887 u64 extent_end = 0;
3888 u64 objectid = inode->i_ino;
3889 u32 found_type;
3890 struct btrfs_path *path = NULL;
3891 struct btrfs_root *root = BTRFS_I(inode)->root;
3892 struct btrfs_file_extent_item *item;
3893 struct extent_buffer *leaf;
3894 struct btrfs_key found_key;
3895 struct extent_map *em = NULL;
3896 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3897 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3898 struct btrfs_trans_handle *trans = NULL;
3899 int compressed;
3900
3901again:
3902 spin_lock(&em_tree->lock);
3903 em = lookup_extent_mapping(em_tree, start, len);
3904 if (em)
3905 em->bdev = root->fs_info->fs_devices->latest_bdev;
3906 spin_unlock(&em_tree->lock);
3907
3908 if (em) {
3909 if (em->start > start || em->start + em->len <= start)
3910 free_extent_map(em);
3911 else if (em->block_start == EXTENT_MAP_INLINE && page)
3912 free_extent_map(em);
3913 else
3914 goto out;
3915 }
3916 em = alloc_extent_map(GFP_NOFS);
3917 if (!em) {
3918 err = -ENOMEM;
3919 goto out;
3920 }
3921 em->bdev = root->fs_info->fs_devices->latest_bdev;
3922 em->start = EXTENT_MAP_HOLE;
3923 em->orig_start = EXTENT_MAP_HOLE;
3924 em->len = (u64)-1;
3925 em->block_len = (u64)-1;
3926
3927 if (!path) {
3928 path = btrfs_alloc_path();
3929 BUG_ON(!path);
3930 }
3931
3932 ret = btrfs_lookup_file_extent(trans, root, path,
3933 objectid, start, trans != NULL);
3934 if (ret < 0) {
3935 err = ret;
3936 goto out;
3937 }
3938
3939 if (ret != 0) {
3940 if (path->slots[0] == 0)
3941 goto not_found;
3942 path->slots[0]--;
3943 }
3944
3945 leaf = path->nodes[0];
3946 item = btrfs_item_ptr(leaf, path->slots[0],
3947 struct btrfs_file_extent_item);
3948 /* are we inside the extent that was found? */
3949 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3950 found_type = btrfs_key_type(&found_key);
3951 if (found_key.objectid != objectid ||
3952 found_type != BTRFS_EXTENT_DATA_KEY) {
3953 goto not_found;
3954 }
3955
3956 found_type = btrfs_file_extent_type(leaf, item);
3957 extent_start = found_key.offset;
3958 compressed = btrfs_file_extent_compression(leaf, item);
3959 if (found_type == BTRFS_FILE_EXTENT_REG ||
3960 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3961 extent_end = extent_start +
3962 btrfs_file_extent_num_bytes(leaf, item);
3963 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3964 size_t size;
3965 size = btrfs_file_extent_inline_len(leaf, item);
3966 extent_end = (extent_start + size + root->sectorsize - 1) &
3967 ~((u64)root->sectorsize - 1);
3968 }
3969
3970 if (start >= extent_end) {
3971 path->slots[0]++;
3972 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3973 ret = btrfs_next_leaf(root, path);
3974 if (ret < 0) {
3975 err = ret;
3976 goto out;
3977 }
3978 if (ret > 0)
3979 goto not_found;
3980 leaf = path->nodes[0];
3981 }
3982 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3983 if (found_key.objectid != objectid ||
3984 found_key.type != BTRFS_EXTENT_DATA_KEY)
3985 goto not_found;
3986 if (start + len <= found_key.offset)
3987 goto not_found;
3988 em->start = start;
3989 em->len = found_key.offset - start;
3990 goto not_found_em;
3991 }
3992
3993 if (found_type == BTRFS_FILE_EXTENT_REG ||
3994 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3995 em->start = extent_start;
3996 em->len = extent_end - extent_start;
3997 em->orig_start = extent_start -
3998 btrfs_file_extent_offset(leaf, item);
3999 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
4000 if (bytenr == 0) {
4001 em->block_start = EXTENT_MAP_HOLE;
4002 goto insert;
4003 }
4004 if (compressed) {
4005 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4006 em->block_start = bytenr;
4007 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4008 item);
4009 } else {
4010 bytenr += btrfs_file_extent_offset(leaf, item);
4011 em->block_start = bytenr;
4012 em->block_len = em->len;
4013 if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
4014 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
4015 }
4016 goto insert;
4017 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4018 unsigned long ptr;
4019 char *map;
4020 size_t size;
4021 size_t extent_offset;
4022 size_t copy_size;
4023
4024 em->block_start = EXTENT_MAP_INLINE;
4025 if (!page || create) {
4026 em->start = extent_start;
4027 em->len = extent_end - extent_start;
4028 goto out;
4029 }
4030
4031 size = btrfs_file_extent_inline_len(leaf, item);
4032 extent_offset = page_offset(page) + pg_offset - extent_start;
4033 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
4034 size - extent_offset);
4035 em->start = extent_start + extent_offset;
4036 em->len = (copy_size + root->sectorsize - 1) &
4037 ~((u64)root->sectorsize - 1);
4038 em->orig_start = EXTENT_MAP_INLINE;
4039 if (compressed)
4040 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4041 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4042 if (create == 0 && !PageUptodate(page)) {
4043 if (btrfs_file_extent_compression(leaf, item) ==
4044 BTRFS_COMPRESS_ZLIB) {
4045 ret = uncompress_inline(path, inode, page,
4046 pg_offset,
4047 extent_offset, item);
4048 BUG_ON(ret);
4049 } else {
4050 map = kmap(page);
4051 read_extent_buffer(leaf, map + pg_offset, ptr,
4052 copy_size);
4053 kunmap(page);
4054 }
4055 flush_dcache_page(page);
4056 } else if (create && PageUptodate(page)) {
4057 if (!trans) {
4058 kunmap(page);
4059 free_extent_map(em);
4060 em = NULL;
4061 btrfs_release_path(root, path);
4062 trans = btrfs_join_transaction(root, 1);
4063 goto again;
4064 }
4065 map = kmap(page);
4066 write_extent_buffer(leaf, map + pg_offset, ptr,
4067 copy_size);
4068 kunmap(page);
4069 btrfs_mark_buffer_dirty(leaf);
4070 }
4071 set_extent_uptodate(io_tree, em->start,
4072 extent_map_end(em) - 1, GFP_NOFS);
4073 goto insert;
4074 } else {
4075 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4076 WARN_ON(1);
4077 }
4078not_found:
4079 em->start = start;
4080 em->len = len;
4081not_found_em:
4082 em->block_start = EXTENT_MAP_HOLE;
4083 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4084insert:
4085 btrfs_release_path(root, path);
4086 if (em->start > start || extent_map_end(em) <= start) {
4087 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4088 "[%llu %llu]\n", (unsigned long long)em->start,
4089 (unsigned long long)em->len,
4090 (unsigned long long)start,
4091 (unsigned long long)len);
4092 err = -EIO;
4093 goto out;
4094 }
4095
4096 err = 0;
4097 spin_lock(&em_tree->lock);
4098 ret = add_extent_mapping(em_tree, em);
4099 /* it is possible that someone inserted the extent into the tree
4100 * while we had the lock dropped. It is also possible that
4101 * an overlapping map exists in the tree
4102 */
4103 if (ret == -EEXIST) {
4104 struct extent_map *existing;
4105
4106 ret = 0;
4107
4108 existing = lookup_extent_mapping(em_tree, start, len);
4109 if (existing && (existing->start > start ||
4110 existing->start + existing->len <= start)) {
4111 free_extent_map(existing);
4112 existing = NULL;
4113 }
4114 if (!existing) {
4115 existing = lookup_extent_mapping(em_tree, em->start,
4116 em->len);
4117 if (existing) {
4118 err = merge_extent_mapping(em_tree, existing,
4119 em, start,
4120 root->sectorsize);
4121 free_extent_map(existing);
4122 if (err) {
4123 free_extent_map(em);
4124 em = NULL;
4125 }
4126 } else {
4127 err = -EIO;
4128 free_extent_map(em);
4129 em = NULL;
4130 }
4131 } else {
4132 free_extent_map(em);
4133 em = existing;
4134 err = 0;
4135 }
4136 }
4137 spin_unlock(&em_tree->lock);
4138out:
4139 if (path)
4140 btrfs_free_path(path);
4141 if (trans) {
4142 ret = btrfs_end_transaction(trans, root);
4143 if (!err)
4144 err = ret;
4145 }
4146 if (err) {
4147 free_extent_map(em);
4148 WARN_ON(1);
4149 return ERR_PTR(err);
4150 }
4151 return em;
4152}
4153
/*
 * ->direct_IO hook installed in btrfs_aops.  Direct I/O is not implemented
 * here: every request is rejected with -EINVAL and none of the arguments
 * are looked at.
 */
4154static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4155 const struct iovec *iov, loff_t offset,
4156 unsigned long nr_segs)
4157{
4158 return -EINVAL;
4159}
4160
/*
 * ->bmap hook installed in btrfs_aops.  Delegates to extent_bmap(), passing
 * btrfs_get_extent as the callback used to look up extent mappings, and
 * returns whatever extent_bmap() returns for @iblock.
 */
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
4162{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent);
4164}
4165
4166int btrfs_readpage(struct file *file, struct page *page)
4167{
4168 struct extent_io_tree *tree;
4169 tree = &BTRFS_I(page->mapping->host)->io_tree;
4170 return extent_read_full_page(tree, page, btrfs_get_extent);
4171}
4172
4173static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4174{
4175 struct extent_io_tree *tree;
4176
4177
4178 if (current->flags & PF_MEMALLOC) {
4179 redirty_page_for_writepage(wbc, page);
4180 unlock_page(page);
4181 return 0;
4182 }
4183 tree = &BTRFS_I(page->mapping->host)->io_tree;
4184 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4185}
4186
4187int btrfs_writepages(struct address_space *mapping,
4188 struct writeback_control *wbc)
4189{
4190 struct extent_io_tree *tree;
4191
4192 tree = &BTRFS_I(mapping->host)->io_tree;
4193 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4194}
4195
4196static int
4197btrfs_readpages(struct file *file, struct address_space *mapping,
4198 struct list_head *pages, unsigned nr_pages)
4199{
4200 struct extent_io_tree *tree;
4201 tree = &BTRFS_I(mapping->host)->io_tree;
4202 return extent_readpages(tree, mapping, pages, nr_pages,
4203 btrfs_get_extent);
4204}
4205static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4206{
4207 struct extent_io_tree *tree;
4208 struct extent_map_tree *map;
4209 int ret;
4210
4211 tree = &BTRFS_I(page->mapping->host)->io_tree;
4212 map = &BTRFS_I(page->mapping->host)->extent_tree;
4213 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4214 if (ret == 1) {
4215 ClearPagePrivate(page);
4216 set_page_private(page, 0);
4217 page_cache_release(page);
4218 }
4219 return ret;
4220}
4221
4222static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4223{
4224 if (PageWriteback(page) || PageDirty(page))
4225 return 0;
4226 return __btrfs_releasepage(page, gfp_flags);
4227}
4228
/*
 * ->invalidatepage hook installed in btrfs_aops and btrfs_symlink_aops.
 *
 * Waits for writeback on @page first.  A non-zero @offset only attempts
 * btrfs_releasepage() and returns.  With @offset == 0 the extent state
 * bits for the page's byte range are cleared, an ordered extent found at
 * the start of the page is finished, and the page's private state is
 * dropped.
 */
4229static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4230{
4231 struct extent_io_tree *tree;
4232 struct btrfs_ordered_extent *ordered;
 /* inclusive byte range [page_start, page_end] covered by this page */
4233 u64 page_start = page_offset(page);
4234 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4235
4236 wait_on_page_writeback(page);
4237 tree = &BTRFS_I(page->mapping->host)->io_tree;
 /* non-zero offset: just try to release the page, touch no extent state */
4238 if (offset) {
4239 btrfs_releasepage(page, GFP_NOFS);
4240 return;
4241 }
4242
4243 lock_extent(tree, page_start, page_end, GFP_NOFS);
4244 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4245 page_offset(page));
4246 if (ordered) {
4247 /*
4248 * IO on this page will never be started, so we need
4249 * to account for any ordered extents now
4250 */
 /*
  * This clear includes EXTENT_LOCKED, i.e. it drops the extent lock
  * taken above before btrfs_finish_ordered_io() runs; the range is
  * locked again once the ordered extent reference has been put.
  */
4251 clear_extent_bit(tree, page_start, page_end,
4252 EXTENT_DIRTY | EXTENT_DELALLOC |
4253 EXTENT_LOCKED, 1, 0, GFP_NOFS);
4254 btrfs_finish_ordered_io(page->mapping->host,
4255 page_start, page_end);
4256 btrfs_put_ordered_extent(ordered);
4257 lock_extent(tree, page_start, page_end, GFP_NOFS);
4258 }
 /* final clear: also drops EXTENT_LOCKED and EXTENT_ORDERED for the range */
4259 clear_extent_bit(tree, page_start, page_end,
4260 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4261 EXTENT_ORDERED,
4262 1, 1, GFP_NOFS);
4263 __btrfs_releasepage(page, GFP_NOFS);
4264
 /*
  * __btrfs_releasepage() only detaches the private state when
  * try_release_extent_mapping() returned 1; the PagePrivate check below
  * covers the case where it did not.
  */
4265 ClearPageChecked(page);
4266 if (PagePrivate(page)) {
4267 ClearPagePrivate(page);
4268 set_page_private(page, 0);
4269 page_cache_release(page);
4270 }
4271}
4272
4273/*
4274 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4275 * called from a page fault handler when a page is first dirtied. Hence we must
4276 * be careful to check for EOF conditions here. We set the page up correctly
4277 * for a written page which means we get ENOSPC checking when writing into
4278 * holes and correct delalloc and unwritten extent mapping on filesystems that
4279 * support these features.
4280 *
4281 * We are not allowed to take the i_mutex here so we have to play games to
4282 * protect against truncate races as the page could now be beyond EOF. Because
4283 * vmtruncate() writes the inode size before removing pages, once we have the
4284 * page lock we can determine safely if the page is beyond EOF. If it is not
4285 * beyond EOF, then the page is guaranteed safe against truncation until we
4286 * unlock the page.
4287 */
/*
 * Return values, as visible in the body: the error from
 * btrfs_check_free_space() if that fails, -EINVAL if the page no longer
 * belongs to this inode's mapping or starts at/after i_size, 0 once the
 * page has been marked delalloc and dirty.
 */
4288int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4289{
4290 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4291 struct btrfs_root *root = BTRFS_I(inode)->root;
4292 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4293 struct btrfs_ordered_extent *ordered;
4294 char *kaddr;
4295 unsigned long zero_start;
4296 loff_t size;
4297 int ret;
4298 u64 page_start;
4299 u64 page_end;
4300
 /* make sure there is room for one more page before taking any locks */
4301 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4302 if (ret)
4303 goto out;
4304
 /* default result for the "page was truncated" exit below */
4305 ret = -EINVAL;
4306again:
4307 lock_page(page);
 /* i_size is sampled with the page locked; see the comment above */
4308 size = i_size_read(inode);
4309 page_start = page_offset(page);
4310 page_end = page_start + PAGE_CACHE_SIZE - 1;
4311
4312 if ((page->mapping != inode->i_mapping) ||
4313 (page_start >= size)) {
4314 /* page got truncated out from underneath us */
4315 goto out_unlock;
4316 }
4317 wait_on_page_writeback(page);
4318
4319 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4320 set_page_extent_mapped(page);
4321
4322 /*
4323 * we can't set the delalloc bits if there are pending ordered
4324 * extents. Drop our locks and wait for them to finish
4325 */
4326 ordered = btrfs_lookup_ordered_extent(inode, page_start);
4327 if (ordered) {
 /* locks are released in reverse order of acquisition, then we retry */
4328 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4329 unlock_page(page);
 /* NOTE(review): last argument presumably means "wait" -- confirm */
4330 btrfs_start_ordered_extent(inode, ordered, 1);
4331 btrfs_put_ordered_extent(ordered);
4332 goto again;
4333 }
4334
4335 btrfs_set_extent_delalloc(inode, page_start, page_end);
4336 ret = 0;
4337
4338 /* page is wholly or partially inside EOF */
 /*
  * zero_start is where zeroing begins inside the page: the in-page
  * offset of i_size when the page extends past EOF (assuming
  * ~PAGE_CACHE_MASK selects the in-page bits), otherwise
  * PAGE_CACHE_SIZE, meaning "nothing to zero".
  */
4339 if (page_start + PAGE_CACHE_SIZE > size)
4340 zero_start = size & ~PAGE_CACHE_MASK;
4341 else
4342 zero_start = PAGE_CACHE_SIZE;
4343
4344 if (zero_start != PAGE_CACHE_SIZE) {
 /* clear the tail of the page that lies beyond i_size */
4345 kaddr = kmap(page);
4346 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4347 flush_dcache_page(page);
4348 kunmap(page);
4349 }
4350 ClearPageChecked(page);
4351 set_page_dirty(page);
4352 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4353
4354out_unlock:
4355 unlock_page(page);
4356out:
4357 return ret;
4358}
4359
/*
 * ->truncate hook for regular files (see btrfs_file_inode_operations).
 *
 * Does nothing for non-regular, append-only or immutable inodes.
 * Otherwise it calls btrfs_truncate_page() at i_size, waits for ordered
 * extents from the sector containing i_size to the end of the file, and
 * then, inside one transaction, adds the inode to the orphan list, drops
 * the file extent items beyond i_size, updates the inode item and removes
 * the orphan entry again.
 */
4360static void btrfs_truncate(struct inode *inode)
4361{
4362 struct btrfs_root *root = BTRFS_I(inode)->root;
4363 int ret;
4364 struct btrfs_trans_handle *trans;
4365 unsigned long nr;
 /* low bits of a byte offset within a sector */
4366 u64 mask = root->sectorsize - 1;
4367
4368 if (!S_ISREG(inode->i_mode))
4369 return;
4370 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4371 return;
4372
4373 btrfs_truncate_page(inode->i_mapping, inode->i_size);
 /* wait from i_size rounded down to a sector boundary up to (u64)-1 */
4374 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4375
4376 trans = btrfs_start_transaction(root, 1);
4377 btrfs_set_trans_block_group(trans, inode);
4378 btrfs_i_size_write(inode, inode->i_size);
4379
4380 ret = btrfs_orphan_add(trans, inode);
4381 if (ret)
4382 goto out;
4383 /* FIXME, add redo link to tree so we don't leak on crash */
 /*
  * NOTE(review): the result of btrfs_truncate_inode_items() is
  * overwritten by btrfs_orphan_del() below without being checked, and
  * the result of btrfs_update_inode() is discarded.
  */
4384 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4385 BTRFS_EXTENT_DATA_KEY);
4386 btrfs_update_inode(trans, root, inode);
4387
4388 ret = btrfs_orphan_del(trans, inode);
4389 BUG_ON(ret);
4390
4391out:
 /* blocks_used is read before the handle is ended */
4392 nr = trans->blocks_used;
4393 ret = btrfs_end_transaction_throttle(trans, root);
4394 BUG_ON(ret);
4395 btrfs_btree_balance_dirty(root, nr);
4396}
4397
4398/*
4399 * create a new subvolume directory/inode (helper for the ioctl).
4400 */
4401int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4402 struct btrfs_root *new_root, struct dentry *dentry,
4403 u64 new_dirid, u64 alloc_hint)
4404{
4405 struct inode *inode;
4406 int error;
4407 u64 index = 0;
4408
4409 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4410 new_dirid, alloc_hint, S_IFDIR | 0700, &index);
4411 if (IS_ERR(inode))
4412 return PTR_ERR(inode);
4413 inode->i_op = &btrfs_dir_inode_operations;
4414 inode->i_fop = &btrfs_dir_file_operations;
4415
4416 inode->i_nlink = 1;
4417 btrfs_i_size_write(inode, 0);
4418
4419 error = btrfs_update_inode(trans, new_root, inode);
4420 if (error)
4421 return error;
4422
4423 d_instantiate(dentry, inode);
4424 return 0;
4425}
4426
4427/* helper function for file defrag and space balancing. This
4428 * forces readahead on a given range of bytes in an inode
4429 */
4430unsigned long btrfs_force_ra(struct address_space *mapping,
4431 struct file_ra_state *ra, struct file *file,
4432 pgoff_t offset, pgoff_t last_index)
4433{
4434 pgoff_t req_size = last_index - offset + 1;
4435
4436 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4437 return offset + req_size;
4438}
4439
4440struct inode *btrfs_alloc_inode(struct super_block *sb)
4441{
4442 struct btrfs_inode *ei;
4443
4444 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4445 if (!ei)
4446 return NULL;
4447 ei->last_trans = 0;
4448 ei->logged_trans = 0;
4449 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4450 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4451 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4452 INIT_LIST_HEAD(&ei->i_orphan);
4453 return &ei->vfs_inode;
4454}
4455
4456void btrfs_destroy_inode(struct inode *inode)
4457{
4458 struct btrfs_ordered_extent *ordered;
4459 WARN_ON(!list_empty(&inode->i_dentry));
4460 WARN_ON(inode->i_data.nrpages);
4461
4462 if (BTRFS_I(inode)->i_acl &&
4463 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4464 posix_acl_release(BTRFS_I(inode)->i_acl);
4465 if (BTRFS_I(inode)->i_default_acl &&
4466 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4467 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4468
4469 spin_lock(&BTRFS_I(inode)->root->list_lock);
4470 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4471 printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
4472 " list\n", inode->i_ino);
4473 dump_stack();
4474 }
4475 spin_unlock(&BTRFS_I(inode)->root->list_lock);
4476
4477 while (1) {
4478 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4479 if (!ordered)
4480 break;
4481 else {
4482 printk(KERN_ERR "btrfs found ordered "
4483 "extent %llu %llu on inode cleanup\n",
4484 (unsigned long long)ordered->file_offset,
4485 (unsigned long long)ordered->len);
4486 btrfs_remove_ordered_extent(inode, ordered);
4487 btrfs_put_ordered_extent(ordered);
4488 btrfs_put_ordered_extent(ordered);
4489 }
4490 }
4491 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4492 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4493}
4494
4495static void init_once(void *foo)
4496{
4497 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4498
4499 inode_init_once(&ei->vfs_inode);
4500}
4501
4502void btrfs_destroy_cachep(void)
4503{
4504 if (btrfs_inode_cachep)
4505 kmem_cache_destroy(btrfs_inode_cachep);
4506 if (btrfs_trans_handle_cachep)
4507 kmem_cache_destroy(btrfs_trans_handle_cachep);
4508 if (btrfs_transaction_cachep)
4509 kmem_cache_destroy(btrfs_transaction_cachep);
4510 if (btrfs_bit_radix_cachep)
4511 kmem_cache_destroy(btrfs_bit_radix_cachep);
4512 if (btrfs_path_cachep)
4513 kmem_cache_destroy(btrfs_path_cachep);
4514}
4515
4516struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4517 unsigned long extra_flags,
4518 void (*ctor)(void *))
4519{
4520 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4521 SLAB_MEM_SPREAD | extra_flags), ctor);
4522}
4523
4524int btrfs_init_cachep(void)
4525{
4526 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4527 sizeof(struct btrfs_inode),
4528 0, init_once);
4529 if (!btrfs_inode_cachep)
4530 goto fail;
4531 btrfs_trans_handle_cachep =
4532 btrfs_cache_create("btrfs_trans_handle_cache",
4533 sizeof(struct btrfs_trans_handle),
4534 0, NULL);
4535 if (!btrfs_trans_handle_cachep)
4536 goto fail;
4537 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4538 sizeof(struct btrfs_transaction),
4539 0, NULL);
4540 if (!btrfs_transaction_cachep)
4541 goto fail;
4542 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4543 sizeof(struct btrfs_path),
4544 0, NULL);
4545 if (!btrfs_path_cachep)
4546 goto fail;
4547 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4548 SLAB_DESTROY_BY_RCU, NULL);
4549 if (!btrfs_bit_radix_cachep)
4550 goto fail;
4551 return 0;
4552fail:
4553 btrfs_destroy_cachep();
4554 return -ENOMEM;
4555}
4556
4557static int btrfs_getattr(struct vfsmount *mnt,
4558 struct dentry *dentry, struct kstat *stat)
4559{
4560 struct inode *inode = dentry->d_inode;
4561 generic_fillattr(inode, stat);
4562 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4563 stat->blksize = PAGE_CACHE_SIZE;
4564 stat->blocks = (inode_get_bytes(inode) +
4565 BTRFS_I(inode)->delalloc_bytes) >> 9;
4566 return 0;
4567}
4568
/*
 * ->rename hook for directories (see btrfs_dir_inode_operations).
 *
 * Moves the entry @old_dentry in @old_dir to @new_dentry in @new_dir.  An
 * existing target inode is unlinked first and put on the orphan list if
 * its link count reaches zero.
 *
 * Returns 0 on success, -EXDEV for a rename across subvolumes or of an
 * inode whose number is BTRFS_FIRST_FREE_OBJECTID, -ENOTEMPTY when a
 * directory would replace a non-empty directory, or the first error
 * returned by one of the helpers called below.
 */
4569static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4570 struct inode *new_dir, struct dentry *new_dentry)
4571{
4572 struct btrfs_trans_handle *trans;
4573 struct btrfs_root *root = BTRFS_I(old_dir)->root;
 /* NULL when the target name does not exist yet */
4574 struct inode *new_inode = new_dentry->d_inode;
4575 struct inode *old_inode = old_dentry->d_inode;
4576 struct timespec ctime = CURRENT_TIME;
4577 u64 index = 0;
4578 int ret;
4579
4580 /* we're not allowed to rename between subvolumes */
4581 if (BTRFS_I(old_inode)->root->root_key.objectid !=
4582 BTRFS_I(new_dir)->root->root_key.objectid)
4583 return -EXDEV;
4584
 /* a directory may only replace a directory that is empty */
4585 if (S_ISDIR(old_inode->i_mode) && new_inode &&
4586 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4587 return -ENOTEMPTY;
4588 }
4589
4590 /* to rename a snapshot or subvolume, we need to juggle the
4591 * backrefs. This isn't coded yet
4592 */
4593 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4594 return -EXDEV;
4595
 /* no transaction has been started yet, so skip straight to the return */
4596 ret = btrfs_check_free_space(root, 1, 0);
4597 if (ret)
4598 goto out_unlock;
4599
4600 trans = btrfs_start_transaction(root, 1);
4601
4602 btrfs_set_trans_block_group(trans, new_dir);
4603
 /*
  * NOTE(review): the link count is bumped before the old name is
  * unlinked, presumably so it cannot reach zero in between -- confirm.
  * It is not dropped again on the error paths below.
  */
4604 btrfs_inc_nlink(old_dentry->d_inode);
4605 old_dir->i_ctime = old_dir->i_mtime = ctime;
4606 new_dir->i_ctime = new_dir->i_mtime = ctime;
4607 old_inode->i_ctime = ctime;
4608
4609 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4610 old_dentry->d_name.name,
4611 old_dentry->d_name.len);
4612 if (ret)
4613 goto out_fail;
4614
4615 if (new_inode) {
 /* this re-reads CURRENT_TIME instead of reusing the ctime sampled above */
4616 new_inode->i_ctime = CURRENT_TIME;
4617 ret = btrfs_unlink_inode(trans, root, new_dir,
4618 new_dentry->d_inode,
4619 new_dentry->d_name.name,
4620 new_dentry->d_name.len);
4621 if (ret)
4622 goto out_fail;
 /* last link gone: track the replaced inode on the orphan list */
4623 if (new_inode->i_nlink == 0) {
4624 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4625 if (ret)
4626 goto out_fail;
4627 }
4628
4629 }
4630 ret = btrfs_set_inode_index(new_dir, &index);
4631 if (ret)
4632 goto out_fail;
4633
4634 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4635 old_inode, new_dentry->d_name.name,
4636 new_dentry->d_name.len, 1, index);
 /* this goto is redundant: success and failure both fall into out_fail */
4637 if (ret)
4638 goto out_fail;
4639
4640out_fail:
 /* the return value of btrfs_end_transaction_throttle() is not checked */
4641 btrfs_end_transaction_throttle(trans, root);
4642out_unlock:
4643 return ret;
4644}
4645
4646/*
4647 * some fairly slow code that needs optimization. This walks the list
4648 * of all the inodes with pending delalloc and forces them to disk.
4649 */
4650int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4651{
 /*
  * Returns -EROFS on a read-only filesystem, 0 otherwise (errors from
  * filemap_flush() are not propagated).
  */
4652 struct list_head *head = &root->fs_info->delalloc_inodes;
4653 struct btrfs_inode *binode;
4654 struct inode *inode;
4655
4656 if (root->fs_info->sb->s_flags & MS_RDONLY)
4657 return -EROFS;
4658
4659 spin_lock(&root->fs_info->delalloc_lock);
 /*
  * Always work on the first list entry.  delalloc_lock is dropped around
  * the flush and re-taken before the list is looked at again.
  */
4660 while (!list_empty(head)) {
4661 binode = list_entry(head->next, struct btrfs_inode,
4662 delalloc_inodes);
 /* igrab() failed: take the entry off the list ourselves */
4663 inode = igrab(&binode->vfs_inode);
4664 if (!inode)
4665 list_del_init(&binode->delalloc_inodes);
4666 spin_unlock(&root->fs_info->delalloc_lock);
 /*
  * NOTE(review): when igrab() succeeds the entry is not removed here;
  * the loop only terminates if the flush path takes the inode off
  * delalloc_inodes -- confirm against the delalloc accounting hooks.
  */
4667 if (inode) {
4668 filemap_flush(inode->i_mapping);
4669 iput(inode);
4670 }
4671 cond_resched();
4672 spin_lock(&root->fs_info->delalloc_lock);
4673 }
4674 spin_unlock(&root->fs_info->delalloc_lock);
4675
4676 /* the filemap_flush will queue IO into the worker threads, but
4677 * we have to make sure the IO is actually started and that
4678 * ordered extents get created before we return
4679 */
 /* async_submit_draining stays raised for as long as we wait */
4680 atomic_inc(&root->fs_info->async_submit_draining);
4681 while (atomic_read(&root->fs_info->nr_async_submits) ||
4682 atomic_read(&root->fs_info->async_delalloc_pages)) {
4683 wait_event(root->fs_info->async_submit_wait,
4684 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4685 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4686 }
4687 atomic_dec(&root->fs_info->async_submit_draining);
4688 return 0;
4689}
4690
4691static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4692 const char *symname)
4693{
4694 struct btrfs_trans_handle *trans;
4695 struct btrfs_root *root = BTRFS_I(dir)->root;
4696 struct btrfs_path *path;
4697 struct btrfs_key key;
4698 struct inode *inode = NULL;
4699 int err;
4700 int drop_inode = 0;
4701 u64 objectid;
4702 u64 index = 0 ;
4703 int name_len;
4704 int datasize;
4705 unsigned long ptr;
4706 struct btrfs_file_extent_item *ei;
4707 struct extent_buffer *leaf;
4708 unsigned long nr = 0;
4709
4710 name_len = strlen(symname) + 1;
4711 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4712 return -ENAMETOOLONG;
4713
4714 err = btrfs_check_free_space(root, 1, 0);
4715 if (err)
4716 goto out_fail;
4717
4718 trans = btrfs_start_transaction(root, 1);
4719 btrfs_set_trans_block_group(trans, dir);
4720
4721 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4722 if (err) {
4723 err = -ENOSPC;
4724 goto out_unlock;
4725 }
4726
4727 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4728 dentry->d_name.len,
4729 dentry->d_parent->d_inode->i_ino, objectid,
4730 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4731 &index);
4732 err = PTR_ERR(inode);
4733 if (IS_ERR(inode))
4734 goto out_unlock;
4735
4736 err = btrfs_init_acl(inode, dir);
4737 if (err) {
4738 drop_inode = 1;
4739 goto out_unlock;
4740 }
4741
4742 btrfs_set_trans_block_group(trans, inode);
4743 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4744 if (err)
4745 drop_inode = 1;
4746 else {
4747 inode->i_mapping->a_ops = &btrfs_aops;
4748 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4749 inode->i_fop = &btrfs_file_operations;
4750 inode->i_op = &btrfs_file_inode_operations;
4751 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4752 }
4753 dir->i_sb->s_dirt = 1;
4754 btrfs_update_inode_block_group(trans, inode);
4755 btrfs_update_inode_block_group(trans, dir);
4756 if (drop_inode)
4757 goto out_unlock;
4758
4759 path = btrfs_alloc_path();
4760 BUG_ON(!path);
4761 key.objectid = inode->i_ino;
4762 key.offset = 0;
4763 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4764 datasize = btrfs_file_extent_calc_inline_size(name_len);
4765 err = btrfs_insert_empty_item(trans, root, path, &key,
4766 datasize);
4767 if (err) {
4768 drop_inode = 1;
4769 goto out_unlock;
4770 }
4771 leaf = path->nodes[0];
4772 ei = btrfs_item_ptr(leaf, path->slots[0],
4773 struct btrfs_file_extent_item);
4774 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4775 btrfs_set_file_extent_type(leaf, ei,
4776 BTRFS_FILE_EXTENT_INLINE);
4777 btrfs_set_file_extent_encryption(leaf, ei, 0);
4778 btrfs_set_file_extent_compression(leaf, ei, 0);
4779 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4780 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4781
4782 ptr = btrfs_file_extent_inline_start(ei);
4783 write_extent_buffer(leaf, symname, ptr, name_len);
4784 btrfs_mark_buffer_dirty(leaf);
4785 btrfs_free_path(path);
4786
4787 inode->i_op = &btrfs_symlink_inode_operations;
4788 inode->i_mapping->a_ops = &btrfs_symlink_aops;
4789 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4790 inode_set_bytes(inode, name_len);
4791 btrfs_i_size_write(inode, name_len - 1);
4792 err = btrfs_update_inode(trans, root, inode);
4793 if (err)
4794 drop_inode = 1;
4795
4796out_unlock:
4797 nr = trans->blocks_used;
4798 btrfs_end_transaction_throttle(trans, root);
4799out_fail:
4800 if (drop_inode) {
4801 inode_dec_link_count(inode);
4802 iput(inode);
4803 }
4804 btrfs_btree_balance_dirty(root, nr);
4805 return err;
4806}
4807
4808static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4809 u64 alloc_hint, int mode)
4810{
4811 struct btrfs_trans_handle *trans;
4812 struct btrfs_root *root = BTRFS_I(inode)->root;
4813 struct btrfs_key ins;
4814 u64 alloc_size;
4815 u64 cur_offset = start;
4816 u64 num_bytes = end - start;
4817 int ret = 0;
4818
4819 trans = btrfs_join_transaction(root, 1);
4820 BUG_ON(!trans);
4821 btrfs_set_trans_block_group(trans, inode);
4822
4823 while (num_bytes > 0) {
4824 alloc_size = min(num_bytes, root->fs_info->max_extent);
4825 ret = btrfs_reserve_extent(trans, root, alloc_size,
4826 root->sectorsize, 0, alloc_hint,
4827 (u64)-1, &ins, 1);
4828 if (ret) {
4829 WARN_ON(1);
4830 goto out;
4831 }
4832 ret = insert_reserved_file_extent(trans, inode,
4833 cur_offset, ins.objectid,
4834 ins.offset, ins.offset,
4835 ins.offset, 0, 0, 0,
4836 BTRFS_FILE_EXTENT_PREALLOC);
4837 BUG_ON(ret);
4838 num_bytes -= ins.offset;
4839 cur_offset += ins.offset;
4840 alloc_hint = ins.objectid + ins.offset;
4841 }
4842out:
4843 if (cur_offset > start) {
4844 inode->i_ctime = CURRENT_TIME;
4845 btrfs_set_flag(inode, PREALLOC);
4846 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4847 cur_offset > i_size_read(inode))
4848 btrfs_i_size_write(inode, cur_offset);
4849 ret = btrfs_update_inode(trans, root, inode);
4850 BUG_ON(ret);
4851 }
4852
4853 btrfs_end_transaction(trans, root);
4854 return ret;
4855}
4856
/*
 * ->fallocate hook for regular files (see btrfs_file_inode_operations).
 *
 * Preallocates space for every hole in the sector-aligned range that
 * covers [@offset, @offset + @len).  Runs under i_mutex with the extent
 * range locked.
 *
 * Returns 0 on success, or the error from btrfs_cont_expand() or
 * prealloc_file_range().
 */
4857static long btrfs_fallocate(struct inode *inode, int mode,
4858 loff_t offset, loff_t len)
4859{
4860 u64 cur_offset;
4861 u64 last_byte;
4862 u64 alloc_start;
4863 u64 alloc_end;
4864 u64 alloc_hint = 0;
4865 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4866 struct extent_map *em;
4867 int ret;
4868
 /* round the start down and the end up to sector boundaries */
4869 alloc_start = offset & ~mask;
4870 alloc_end = (offset + len + mask) & ~mask;
4871
4872 mutex_lock(&inode->i_mutex);
 /* NOTE(review): presumably fills the gap between i_size and alloc_start -- confirm */
4873 if (alloc_start > inode->i_size) {
4874 ret = btrfs_cont_expand(inode, alloc_start);
4875 if (ret)
4876 goto out;
4877 }
4878
 /*
  * Lock the extent range, then make sure no ordered extent overlaps it.
  * If one does, drop the lock, wait for the range and try again.
  */
4879 while (1) {
4880 struct btrfs_ordered_extent *ordered;
4881 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4882 alloc_end - 1, GFP_NOFS);
4883 ordered = btrfs_lookup_first_ordered_extent(inode,
4884 alloc_end - 1);
4885 if (ordered &&
4886 ordered->file_offset + ordered->len > alloc_start &&
4887 ordered->file_offset < alloc_end) {
4888 btrfs_put_ordered_extent(ordered);
4889 unlock_extent(&BTRFS_I(inode)->io_tree,
4890 alloc_start, alloc_end - 1, GFP_NOFS);
4891 btrfs_wait_ordered_range(inode, alloc_start,
4892 alloc_end - alloc_start);
4893 } else {
4894 if (ordered)
4895 btrfs_put_ordered_extent(ordered);
4896 break;
4897 }
4898 }
4899
 /* walk the extent maps over the range and preallocate every hole */
4900 cur_offset = alloc_start;
4901 while (1) {
4902 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4903 alloc_end - cur_offset, 0);
4904 BUG_ON(IS_ERR(em) || !em);
 /* end of this step: end of the mapping, capped at alloc_end, sector aligned */
4905 last_byte = min(extent_map_end(em), alloc_end);
4906 last_byte = (last_byte + mask) & ~mask;
4907 if (em->block_start == EXTENT_MAP_HOLE) {
4908 ret = prealloc_file_range(inode, cur_offset,
4909 last_byte, alloc_hint, mode);
4910 if (ret < 0) {
4911 free_extent_map(em);
4912 break;
4913 }
4914 }
 /* a mapping with a real block address becomes the next allocation hint */
4915 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4916 alloc_hint = em->block_start;
4917 free_extent_map(em);
4918
4919 cur_offset = last_byte;
4920 if (cur_offset >= alloc_end) {
4921 ret = 0;
4922 break;
4923 }
4924 }
4925 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4926 GFP_NOFS);
4927out:
4928 mutex_unlock(&inode->i_mutex);
4929 return ret;
4930}
4931
/*
 * ->set_page_dirty hook installed in btrfs_aops: forwards to
 * __set_page_dirty_nobuffers() and returns its result.
 */
4932static int btrfs_set_page_dirty(struct page *page)
4933{
4934 return __set_page_dirty_nobuffers(page);
4935}
4936
4937static int btrfs_permission(struct inode *inode, int mask)
4938{
4939 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4940 return -EACCES;
4941 return generic_permission(inode, mask, btrfs_check_acl);
4942}
4943
/*
 * Inode operations for ordinary directories: name lookup, creation and
 * removal of entries, rename, attributes, xattrs and permission checks.
 */
4944static struct inode_operations btrfs_dir_inode_operations = {
4945 .getattr = btrfs_getattr,
4946 .lookup = btrfs_lookup,
4947 .create = btrfs_create,
4948 .unlink = btrfs_unlink,
4949 .link = btrfs_link,
4950 .mkdir = btrfs_mkdir,
4951 .rmdir = btrfs_rmdir,
4952 .rename = btrfs_rename,
4953 .symlink = btrfs_symlink,
4954 .setattr = btrfs_setattr,
4955 .mknod = btrfs_mknod,
4956 .setxattr = btrfs_setxattr,
4957 .getxattr = btrfs_getxattr,
4958 .listxattr = btrfs_listxattr,
4959 .removexattr = btrfs_removexattr,
4960 .permission = btrfs_permission,
4961};
/*
 * Inode operations for read-only directories: only lookup and permission
 * checks are provided, none of the modifying operations.
 */
4962static struct inode_operations btrfs_dir_ro_inode_operations = {
4963 .lookup = btrfs_lookup,
4964 .permission = btrfs_permission,
4965};
/*
 * File operations for open directories.  The compat ioctl entry point is
 * the same handler as the native one and is only built with CONFIG_COMPAT.
 */
4966static struct file_operations btrfs_dir_file_operations = {
4967 .llseek = generic_file_llseek,
4968 .read = generic_read_dir,
4969 .readdir = btrfs_real_readdir,
4970 .unlocked_ioctl = btrfs_ioctl,
4971#ifdef CONFIG_COMPAT
4972 .compat_ioctl = btrfs_ioctl,
4973#endif
4974 .release = btrfs_release_file,
4975 .fsync = btrfs_sync_file,
4976};
4977
/*
 * Callbacks btrfs registers with the extent I/O layer.  btrfs_symlink()
 * installs this table as io_tree.ops on the inodes it creates.
 */
4978static struct extent_io_ops btrfs_extent_io_ops = {
4979 .fill_delalloc = run_delalloc_range,
4980 .submit_bio_hook = btrfs_submit_bio_hook,
4981 .merge_bio_hook = btrfs_merge_bio_hook,
4982 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
4983 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
4984 .writepage_start_hook = btrfs_writepage_start_hook,
4985 .readpage_io_failed_hook = btrfs_io_failed_hook,
4986 .set_bit_hook = btrfs_set_bit_hook,
4987 .clear_bit_hook = btrfs_clear_bit_hook,
4988};
4989
/*
 * Address space operations for file data.  Note that .direct_IO points at
 * btrfs_direct_IO(), which rejects every request with -EINVAL.
 */
4990static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage,
5000 .set_page_dirty = btrfs_set_page_dirty,
5001};
5002
/*
 * Reduced address space operations installed on symlink inodes by
 * btrfs_symlink(): single-page read/write plus invalidate and release.
 */
5003static struct address_space_operations btrfs_symlink_aops = {
5004 .readpage = btrfs_readpage,
5005 .writepage = btrfs_writepage,
5006 .invalidatepage = btrfs_invalidatepage,
5007 .releasepage = btrfs_releasepage,
5008};
5009
/*
 * Inode operations for regular files: truncate and fallocate in addition
 * to attributes, xattrs and permission checks.
 */
5010static struct inode_operations btrfs_file_inode_operations = {
5011 .truncate = btrfs_truncate,
5012 .getattr = btrfs_getattr,
5013 .setattr = btrfs_setattr,
5014 .setxattr = btrfs_setxattr,
5015 .getxattr = btrfs_getxattr,
5016 .listxattr = btrfs_listxattr,
5017 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate,
5020};
/*
 * Inode operations for special inodes: attributes, xattrs and permission
 * checks only.
 */
5021static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr,
5023 .setattr = btrfs_setattr,
5024 .permission = btrfs_permission,
5025 .setxattr = btrfs_setxattr,
5026 .getxattr = btrfs_getxattr,
5027 .listxattr = btrfs_listxattr,
5028 .removexattr = btrfs_removexattr,
5029};
/*
 * Inode operations for symlinks, installed by btrfs_symlink().  Link
 * reading goes through the generic page-based helpers.
 */
5030static struct inode_operations btrfs_symlink_inode_operations = {
5031 .readlink = generic_readlink,
5032 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link,
5034 .permission = btrfs_permission,
5035};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..c2aa33e3feb5
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1132 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/fsnotify.h>
25#include <linux/pagemap.h>
26#include <linux/highmem.h>
27#include <linux/time.h>
28#include <linux/init.h>
29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h>
32#include <linux/mount.h>
33#include <linux/mpage.h>
34#include <linux/namei.h>
35#include <linux/swap.h>
36#include <linux/writeback.h>
37#include <linux/statfs.h>
38#include <linux/compat.h>
39#include <linux/bit_spinlock.h>
40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h>
43#include <linux/vmalloc.h>
44#include "compat.h"
45#include "ctree.h"
46#include "disk-io.h"
47#include "transaction.h"
48#include "btrfs_inode.h"
49#include "ioctl.h"
50#include "print-tree.h"
51#include "volumes.h"
52#include "locking.h"
53
54
55
56static noinline int create_subvol(struct btrfs_root *root,
57 struct dentry *dentry,
58 char *name, int namelen)
59{
60 struct btrfs_trans_handle *trans;
61 struct btrfs_key key;
62 struct btrfs_root_item root_item;
63 struct btrfs_inode_item *inode_item;
64 struct extent_buffer *leaf;
65 struct btrfs_root *new_root = root;
66 struct inode *dir;
67 int ret;
68 int err;
69 u64 objectid;
70 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
71 u64 index = 0;
72 unsigned long nr = 1;
73
74 ret = btrfs_check_free_space(root, 1, 0);
75 if (ret)
76 goto fail_commit;
77
78 trans = btrfs_start_transaction(root, 1);
79 BUG_ON(!trans);
80
81 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
82 0, &objectid);
83 if (ret)
84 goto fail;
85
86 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
87 objectid, trans->transid, 0, 0, 0);
88 if (IS_ERR(leaf)) {
89 ret = PTR_ERR(leaf);
90 goto fail;
91 }
92
93 btrfs_set_header_nritems(leaf, 0);
94 btrfs_set_header_level(leaf, 0);
95 btrfs_set_header_bytenr(leaf, leaf->start);
96 btrfs_set_header_generation(leaf, trans->transid);
97 btrfs_set_header_owner(leaf, objectid);
98
99 write_extent_buffer(leaf, root->fs_info->fsid,
100 (unsigned long)btrfs_header_fsid(leaf),
101 BTRFS_FSID_SIZE);
102 btrfs_mark_buffer_dirty(leaf);
103
104 inode_item = &root_item.inode;
105 memset(inode_item, 0, sizeof(*inode_item));
106 inode_item->generation = cpu_to_le64(1);
107 inode_item->size = cpu_to_le64(3);
108 inode_item->nlink = cpu_to_le32(1);
109 inode_item->nbytes = cpu_to_le64(root->leafsize);
110 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
111
112 btrfs_set_root_bytenr(&root_item, leaf->start);
113 btrfs_set_root_generation(&root_item, trans->transid);
114 btrfs_set_root_level(&root_item, 0);
115 btrfs_set_root_refs(&root_item, 1);
116 btrfs_set_root_used(&root_item, 0);
117 btrfs_set_root_last_snapshot(&root_item, 0);
118
119 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
120 root_item.drop_level = 0;
121
122 btrfs_tree_unlock(leaf);
123 free_extent_buffer(leaf);
124 leaf = NULL;
125
126 btrfs_set_root_dirid(&root_item, new_dirid);
127
128 key.objectid = objectid;
129 key.offset = 1;
130 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
131 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
132 &root_item);
133 if (ret)
134 goto fail;
135
136 /*
137 * insert the directory item
138 */
139 key.offset = (u64)-1;
140 dir = dentry->d_parent->d_inode;
141 ret = btrfs_set_inode_index(dir, &index);
142 BUG_ON(ret);
143
144 ret = btrfs_insert_dir_item(trans, root,
145 name, namelen, dir->i_ino, &key,
146 BTRFS_FT_DIR, index);
147 if (ret)
148 goto fail;
149
150 btrfs_i_size_write(dir, dir->i_size + namelen * 2);
151 ret = btrfs_update_inode(trans, root, dir);
152 BUG_ON(ret);
153
154 /* add the backref first */
155 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
156 objectid, BTRFS_ROOT_BACKREF_KEY,
157 root->root_key.objectid,
158 dir->i_ino, index, name, namelen);
159
160 BUG_ON(ret);
161
162 /* now add the forward ref */
163 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
164 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
165 objectid,
166 dir->i_ino, index, name, namelen);
167
168 BUG_ON(ret);
169
170 ret = btrfs_commit_transaction(trans, root);
171 if (ret)
172 goto fail_commit;
173
174 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
175 BUG_ON(!new_root);
176
177 trans = btrfs_start_transaction(new_root, 1);
178 BUG_ON(!trans);
179
180 ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
181 BTRFS_I(dir)->block_group);
182 if (ret)
183 goto fail;
184
185fail:
186 nr = trans->blocks_used;
187 err = btrfs_commit_transaction(trans, new_root);
188 if (err && !ret)
189 ret = err;
190fail_commit:
191 btrfs_btree_balance_dirty(root, nr);
192 return ret;
193}
194
195static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
196 char *name, int namelen)
197{
198 struct btrfs_pending_snapshot *pending_snapshot;
199 struct btrfs_trans_handle *trans;
200 int ret = 0;
201 int err;
202 unsigned long nr = 0;
203
204 if (!root->ref_cows)
205 return -EINVAL;
206
207 ret = btrfs_check_free_space(root, 1, 0);
208 if (ret)
209 goto fail_unlock;
210
211 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
212 if (!pending_snapshot) {
213 ret = -ENOMEM;
214 goto fail_unlock;
215 }
216 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
217 if (!pending_snapshot->name) {
218 ret = -ENOMEM;
219 kfree(pending_snapshot);
220 goto fail_unlock;
221 }
222 memcpy(pending_snapshot->name, name, namelen);
223 pending_snapshot->name[namelen] = '\0';
224 pending_snapshot->dentry = dentry;
225 trans = btrfs_start_transaction(root, 1);
226 BUG_ON(!trans);
227 pending_snapshot->root = root;
228 list_add(&pending_snapshot->list,
229 &trans->transaction->pending_snapshots);
230 err = btrfs_commit_transaction(trans, root);
231
232fail_unlock:
233 btrfs_btree_balance_dirty(root, nr);
234 return ret;
235}
236
237/* copy of may_create in fs/namei.c() */
238static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
239{
240 if (child->d_inode)
241 return -EEXIST;
242 if (IS_DEADDIR(dir))
243 return -ENOENT;
244 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
245}
246
247/*
248 * Create a new subvolume below @parent. This is largely modeled after
249 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
250 * inside this filesystem so it's quite a bit simpler.
251 */
252static noinline int btrfs_mksubvol(struct path *parent, char *name,
253 int mode, int namelen,
254 struct btrfs_root *snap_src)
255{
256 struct dentry *dentry;
257 int error;
258
259 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
260
261 dentry = lookup_one_len(name, parent->dentry, namelen);
262 error = PTR_ERR(dentry);
263 if (IS_ERR(dentry))
264 goto out_unlock;
265
266 error = -EEXIST;
267 if (dentry->d_inode)
268 goto out_dput;
269
270 if (!IS_POSIXACL(parent->dentry->d_inode))
271 mode &= ~current->fs->umask;
272
273 error = mnt_want_write(parent->mnt);
274 if (error)
275 goto out_dput;
276
277 error = btrfs_may_create(parent->dentry->d_inode, dentry);
278 if (error)
279 goto out_drop_write;
280
281 /*
282 * Actually perform the low-level subvolume creation after all
283 * this VFS fuzz.
284 *
285 * Eventually we want to pass in an inode under which we create this
286 * subvolume, but for now all are under the filesystem root.
287 *
288 * Also we should pass on the mode eventually to allow creating new
289 * subvolume with specific mode bits.
290 */
291 if (snap_src) {
292 struct dentry *dir = dentry->d_parent;
293 struct dentry *test = dir->d_parent;
294 struct btrfs_path *path = btrfs_alloc_path();
295 int ret;
296 u64 test_oid;
297 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
298
299 test_oid = snap_src->root_key.objectid;
300
301 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
302 path, parent_oid, test_oid);
303 if (ret == 0)
304 goto create;
305 btrfs_release_path(snap_src->fs_info->tree_root, path);
306
307 /* we need to make sure we aren't creating a directory loop
308 * by taking a snapshot of something that has our current
309 * subvol in its directory tree. So, this loops through
310 * the dentries and checks the forward refs for each subvolume
311 * to see if is references the subvolume where we are
312 * placing this new snapshot.
313 */
314 while (1) {
315 if (!test ||
316 dir == snap_src->fs_info->sb->s_root ||
317 test == snap_src->fs_info->sb->s_root ||
318 test->d_inode->i_sb != snap_src->fs_info->sb) {
319 break;
320 }
321 if (S_ISLNK(test->d_inode->i_mode)) {
322 printk(KERN_INFO "Btrfs symlink in snapshot "
323 "path, failed\n");
324 error = -EMLINK;
325 btrfs_free_path(path);
326 goto out_drop_write;
327 }
328 test_oid =
329 BTRFS_I(test->d_inode)->root->root_key.objectid;
330 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
331 path, test_oid, parent_oid);
332 if (ret == 0) {
333 printk(KERN_INFO "Btrfs snapshot creation "
334 "failed, looping\n");
335 error = -EMLINK;
336 btrfs_free_path(path);
337 goto out_drop_write;
338 }
339 btrfs_release_path(snap_src->fs_info->tree_root, path);
340 test = test->d_parent;
341 }
342create:
343 btrfs_free_path(path);
344 error = create_snapshot(snap_src, dentry, name, namelen);
345 } else {
346 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
347 dentry, name, namelen);
348 }
349 if (error)
350 goto out_drop_write;
351
352 fsnotify_mkdir(parent->dentry->d_inode, dentry);
353out_drop_write:
354 mnt_drop_write(parent->mnt);
355out_dput:
356 dput(dentry);
357out_unlock:
358 mutex_unlock(&parent->dentry->d_inode->i_mutex);
359 return error;
360}
361
362
363static int btrfs_defrag_file(struct file *file)
364{
365 struct inode *inode = fdentry(file)->d_inode;
366 struct btrfs_root *root = BTRFS_I(inode)->root;
367 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
368 struct btrfs_ordered_extent *ordered;
369 struct page *page;
370 unsigned long last_index;
371 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
372 unsigned long total_read = 0;
373 u64 page_start;
374 u64 page_end;
375 unsigned long i;
376 int ret;
377
378 ret = btrfs_check_free_space(root, inode->i_size, 0);
379 if (ret)
380 return -ENOSPC;
381
382 mutex_lock(&inode->i_mutex);
383 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
384 for (i = 0; i <= last_index; i++) {
385 if (total_read % ra_pages == 0) {
386 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
387 min(last_index, i + ra_pages - 1));
388 }
389 total_read++;
390again:
391 page = grab_cache_page(inode->i_mapping, i);
392 if (!page)
393 goto out_unlock;
394 if (!PageUptodate(page)) {
395 btrfs_readpage(NULL, page);
396 lock_page(page);
397 if (!PageUptodate(page)) {
398 unlock_page(page);
399 page_cache_release(page);
400 goto out_unlock;
401 }
402 }
403
404 wait_on_page_writeback(page);
405
406 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
407 page_end = page_start + PAGE_CACHE_SIZE - 1;
408 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
409
410 ordered = btrfs_lookup_ordered_extent(inode, page_start);
411 if (ordered) {
412 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
413 unlock_page(page);
414 page_cache_release(page);
415 btrfs_start_ordered_extent(inode, ordered, 1);
416 btrfs_put_ordered_extent(ordered);
417 goto again;
418 }
419 set_page_extent_mapped(page);
420
421 /*
422 * this makes sure page_mkwrite is called on the
423 * page if it is dirtied again later
424 */
425 clear_page_dirty_for_io(page);
426
427 btrfs_set_extent_delalloc(inode, page_start, page_end);
428
429 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
430 set_page_dirty(page);
431 unlock_page(page);
432 page_cache_release(page);
433 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
434 }
435
436out_unlock:
437 mutex_unlock(&inode->i_mutex);
438 return 0;
439}
440
441/*
442 * Called inside transaction, so use GFP_NOFS
443 */
444
445static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
446{
447 u64 new_size;
448 u64 old_size;
449 u64 devid = 1;
450 struct btrfs_ioctl_vol_args *vol_args;
451 struct btrfs_trans_handle *trans;
452 struct btrfs_device *device = NULL;
453 char *sizestr;
454 char *devstr = NULL;
455 int ret = 0;
456 int namelen;
457 int mod = 0;
458
459 if (root->fs_info->sb->s_flags & MS_RDONLY)
460 return -EROFS;
461
462 if (!capable(CAP_SYS_ADMIN))
463 return -EPERM;
464
465 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
466
467 if (!vol_args)
468 return -ENOMEM;
469
470 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
471 ret = -EFAULT;
472 goto out;
473 }
474
475 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
476 namelen = strlen(vol_args->name);
477
478 mutex_lock(&root->fs_info->volume_mutex);
479 sizestr = vol_args->name;
480 devstr = strchr(sizestr, ':');
481 if (devstr) {
482 char *end;
483 sizestr = devstr + 1;
484 *devstr = '\0';
485 devstr = vol_args->name;
486 devid = simple_strtoull(devstr, &end, 10);
487 printk(KERN_INFO "resizing devid %llu\n", devid);
488 }
489 device = btrfs_find_device(root, devid, NULL, NULL);
490 if (!device) {
491 printk(KERN_INFO "resizer unable to find device %llu\n", devid);
492 ret = -EINVAL;
493 goto out_unlock;
494 }
495 if (!strcmp(sizestr, "max"))
496 new_size = device->bdev->bd_inode->i_size;
497 else {
498 if (sizestr[0] == '-') {
499 mod = -1;
500 sizestr++;
501 } else if (sizestr[0] == '+') {
502 mod = 1;
503 sizestr++;
504 }
505 new_size = btrfs_parse_size(sizestr);
506 if (new_size == 0) {
507 ret = -EINVAL;
508 goto out_unlock;
509 }
510 }
511
512 old_size = device->total_bytes;
513
514 if (mod < 0) {
515 if (new_size > old_size) {
516 ret = -EINVAL;
517 goto out_unlock;
518 }
519 new_size = old_size - new_size;
520 } else if (mod > 0) {
521 new_size = old_size + new_size;
522 }
523
524 if (new_size < 256 * 1024 * 1024) {
525 ret = -EINVAL;
526 goto out_unlock;
527 }
528 if (new_size > device->bdev->bd_inode->i_size) {
529 ret = -EFBIG;
530 goto out_unlock;
531 }
532
533 do_div(new_size, root->sectorsize);
534 new_size *= root->sectorsize;
535
536 printk(KERN_INFO "new size for %s is %llu\n",
537 device->name, (unsigned long long)new_size);
538
539 if (new_size > old_size) {
540 trans = btrfs_start_transaction(root, 1);
541 ret = btrfs_grow_device(trans, device, new_size);
542 btrfs_commit_transaction(trans, root);
543 } else {
544 ret = btrfs_shrink_device(device, new_size);
545 }
546
547out_unlock:
548 mutex_unlock(&root->fs_info->volume_mutex);
549out:
550 kfree(vol_args);
551 return ret;
552}
553
554static noinline int btrfs_ioctl_snap_create(struct file *file,
555 void __user *arg, int subvol)
556{
557 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
558 struct btrfs_ioctl_vol_args *vol_args;
559 struct btrfs_dir_item *di;
560 struct btrfs_path *path;
561 struct file *src_file;
562 u64 root_dirid;
563 int namelen;
564 int ret = 0;
565
566 if (root->fs_info->sb->s_flags & MS_RDONLY)
567 return -EROFS;
568
569 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
570
571 if (!vol_args)
572 return -ENOMEM;
573
574 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
575 ret = -EFAULT;
576 goto out;
577 }
578
579 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
580 namelen = strlen(vol_args->name);
581 if (strchr(vol_args->name, '/')) {
582 ret = -EINVAL;
583 goto out;
584 }
585
586 path = btrfs_alloc_path();
587 if (!path) {
588 ret = -ENOMEM;
589 goto out;
590 }
591
592 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
593 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
594 path, root_dirid,
595 vol_args->name, namelen, 0);
596 btrfs_free_path(path);
597
598 if (di && !IS_ERR(di)) {
599 ret = -EEXIST;
600 goto out;
601 }
602
603 if (IS_ERR(di)) {
604 ret = PTR_ERR(di);
605 goto out;
606 }
607
608 if (subvol) {
609 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
610 file->f_path.dentry->d_inode->i_mode,
611 namelen, NULL);
612 } else {
613 struct inode *src_inode;
614 src_file = fget(vol_args->fd);
615 if (!src_file) {
616 ret = -EINVAL;
617 goto out;
618 }
619
620 src_inode = src_file->f_path.dentry->d_inode;
621 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
622 printk(KERN_INFO "btrfs: Snapshot src from "
623 "another FS\n");
624 ret = -EINVAL;
625 fput(src_file);
626 goto out;
627 }
628 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
629 file->f_path.dentry->d_inode->i_mode,
630 namelen, BTRFS_I(src_inode)->root);
631 fput(src_file);
632 }
633
634out:
635 kfree(vol_args);
636 return ret;
637}
638
639static int btrfs_ioctl_defrag(struct file *file)
640{
641 struct inode *inode = fdentry(file)->d_inode;
642 struct btrfs_root *root = BTRFS_I(inode)->root;
643 int ret;
644
645 ret = mnt_want_write(file->f_path.mnt);
646 if (ret)
647 return ret;
648
649 switch (inode->i_mode & S_IFMT) {
650 case S_IFDIR:
651 if (!capable(CAP_SYS_ADMIN)) {
652 ret = -EPERM;
653 goto out;
654 }
655 btrfs_defrag_root(root, 0);
656 btrfs_defrag_root(root->fs_info->extent_root, 0);
657 break;
658 case S_IFREG:
659 if (!(file->f_mode & FMODE_WRITE)) {
660 ret = -EINVAL;
661 goto out;
662 }
663 btrfs_defrag_file(file);
664 break;
665 }
666out:
667 mnt_drop_write(file->f_path.mnt);
668 return ret;
669}
670
671static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
672{
673 struct btrfs_ioctl_vol_args *vol_args;
674 int ret;
675
676 if (!capable(CAP_SYS_ADMIN))
677 return -EPERM;
678
679 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
680
681 if (!vol_args)
682 return -ENOMEM;
683
684 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
685 ret = -EFAULT;
686 goto out;
687 }
688 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
689 ret = btrfs_init_new_device(root, vol_args->name);
690
691out:
692 kfree(vol_args);
693 return ret;
694}
695
696static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
697{
698 struct btrfs_ioctl_vol_args *vol_args;
699 int ret;
700
701 if (!capable(CAP_SYS_ADMIN))
702 return -EPERM;
703
704 if (root->fs_info->sb->s_flags & MS_RDONLY)
705 return -EROFS;
706
707 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
708
709 if (!vol_args)
710 return -ENOMEM;
711
712 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
713 ret = -EFAULT;
714 goto out;
715 }
716 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
717 ret = btrfs_rm_device(root, vol_args->name);
718
719out:
720 kfree(vol_args);
721 return ret;
722}
723
724static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
725 u64 off, u64 olen, u64 destoff)
726{
727 struct inode *inode = fdentry(file)->d_inode;
728 struct btrfs_root *root = BTRFS_I(inode)->root;
729 struct file *src_file;
730 struct inode *src;
731 struct btrfs_trans_handle *trans;
732 struct btrfs_path *path;
733 struct extent_buffer *leaf;
734 char *buf;
735 struct btrfs_key key;
736 u32 nritems;
737 int slot;
738 int ret;
739 u64 len = olen;
740 u64 bs = root->fs_info->sb->s_blocksize;
741 u64 hint_byte;
742
743 /*
744 * TODO:
745 * - split compressed inline extents. annoying: we need to
746 * decompress into destination's address_space (the file offset
747 * may change, so source mapping won't do), then recompress (or
748 * otherwise reinsert) a subrange.
749 * - allow ranges within the same file to be cloned (provided
750 * they don't overlap)?
751 */
752
753 /* the destination must be opened for writing */
754 if (!(file->f_mode & FMODE_WRITE))
755 return -EINVAL;
756
757 ret = mnt_want_write(file->f_path.mnt);
758 if (ret)
759 return ret;
760
761 src_file = fget(srcfd);
762 if (!src_file) {
763 ret = -EBADF;
764 goto out_drop_write;
765 }
766 src = src_file->f_dentry->d_inode;
767
768 ret = -EINVAL;
769 if (src == inode)
770 goto out_fput;
771
772 ret = -EISDIR;
773 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
774 goto out_fput;
775
776 ret = -EXDEV;
777 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
778 goto out_fput;
779
780 ret = -ENOMEM;
781 buf = vmalloc(btrfs_level_size(root, 0));
782 if (!buf)
783 goto out_fput;
784
785 path = btrfs_alloc_path();
786 if (!path) {
787 vfree(buf);
788 goto out_fput;
789 }
790 path->reada = 2;
791
792 if (inode < src) {
793 mutex_lock(&inode->i_mutex);
794 mutex_lock(&src->i_mutex);
795 } else {
796 mutex_lock(&src->i_mutex);
797 mutex_lock(&inode->i_mutex);
798 }
799
800 /* determine range to clone */
801 ret = -EINVAL;
802 if (off >= src->i_size || off + len > src->i_size)
803 goto out_unlock;
804 if (len == 0)
805 olen = len = src->i_size - off;
806 /* if we extend to eof, continue to block boundary */
807 if (off + len == src->i_size)
808 len = ((src->i_size + bs-1) & ~(bs-1))
809 - off;
810
811 /* verify the end result is block aligned */
812 if ((off & (bs-1)) ||
813 ((off + len) & (bs-1)))
814 goto out_unlock;
815
816 /* do any pending delalloc/csum calc on src, one way or
817 another, and lock file content */
818 while (1) {
819 struct btrfs_ordered_extent *ordered;
820 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
821 ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
822 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
823 break;
824 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
825 if (ordered)
826 btrfs_put_ordered_extent(ordered);
827 btrfs_wait_ordered_range(src, off, off+len);
828 }
829
830 trans = btrfs_start_transaction(root, 1);
831 BUG_ON(!trans);
832
833 /* punch hole in destination first */
834 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
835
836 /* clone data */
837 key.objectid = src->i_ino;
838 key.type = BTRFS_EXTENT_DATA_KEY;
839 key.offset = 0;
840
841 while (1) {
842 /*
843 * note the key will change type as we walk through the
844 * tree.
845 */
846 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
847 if (ret < 0)
848 goto out;
849
850 nritems = btrfs_header_nritems(path->nodes[0]);
851 if (path->slots[0] >= nritems) {
852 ret = btrfs_next_leaf(root, path);
853 if (ret < 0)
854 goto out;
855 if (ret > 0)
856 break;
857 nritems = btrfs_header_nritems(path->nodes[0]);
858 }
859 leaf = path->nodes[0];
860 slot = path->slots[0];
861
862 btrfs_item_key_to_cpu(leaf, &key, slot);
863 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
864 key.objectid != src->i_ino)
865 break;
866
867 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
868 struct btrfs_file_extent_item *extent;
869 int type;
870 u32 size;
871 struct btrfs_key new_key;
872 u64 disko = 0, diskl = 0;
873 u64 datao = 0, datal = 0;
874 u8 comp;
875
876 size = btrfs_item_size_nr(leaf, slot);
877 read_extent_buffer(leaf, buf,
878 btrfs_item_ptr_offset(leaf, slot),
879 size);
880
881 extent = btrfs_item_ptr(leaf, slot,
882 struct btrfs_file_extent_item);
883 comp = btrfs_file_extent_compression(leaf, extent);
884 type = btrfs_file_extent_type(leaf, extent);
885 if (type == BTRFS_FILE_EXTENT_REG) {
886 disko = btrfs_file_extent_disk_bytenr(leaf,
887 extent);
888 diskl = btrfs_file_extent_disk_num_bytes(leaf,
889 extent);
890 datao = btrfs_file_extent_offset(leaf, extent);
891 datal = btrfs_file_extent_num_bytes(leaf,
892 extent);
893 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
894 /* take upper bound, may be compressed */
895 datal = btrfs_file_extent_ram_bytes(leaf,
896 extent);
897 }
898 btrfs_release_path(root, path);
899
900 if (key.offset + datal < off ||
901 key.offset >= off+len)
902 goto next;
903
904 memcpy(&new_key, &key, sizeof(new_key));
905 new_key.objectid = inode->i_ino;
906 new_key.offset = key.offset + destoff - off;
907
908 if (type == BTRFS_FILE_EXTENT_REG) {
909 ret = btrfs_insert_empty_item(trans, root, path,
910 &new_key, size);
911 if (ret)
912 goto out;
913
914 leaf = path->nodes[0];
915 slot = path->slots[0];
916 write_extent_buffer(leaf, buf,
917 btrfs_item_ptr_offset(leaf, slot),
918 size);
919
920 extent = btrfs_item_ptr(leaf, slot,
921 struct btrfs_file_extent_item);
922
923 if (off > key.offset) {
924 datao += off - key.offset;
925 datal -= off - key.offset;
926 }
927 if (key.offset + datao + datal + key.offset >
928 off + len)
929 datal = off + len - key.offset - datao;
930 /* disko == 0 means it's a hole */
931 if (!disko)
932 datao = 0;
933
934 btrfs_set_file_extent_offset(leaf, extent,
935 datao);
936 btrfs_set_file_extent_num_bytes(leaf, extent,
937 datal);
938 if (disko) {
939 inode_add_bytes(inode, datal);
940 ret = btrfs_inc_extent_ref(trans, root,
941 disko, diskl, leaf->start,
942 root->root_key.objectid,
943 trans->transid,
944 inode->i_ino);
945 BUG_ON(ret);
946 }
947 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
948 u64 skip = 0;
949 u64 trim = 0;
950 if (off > key.offset) {
951 skip = off - key.offset;
952 new_key.offset += skip;
953 }
954
955 if (key.offset + datal > off+len)
956 trim = key.offset + datal - (off+len);
957
958 if (comp && (skip || trim)) {
959 ret = -EINVAL;
960 goto out;
961 }
962 size -= skip + trim;
963 datal -= skip + trim;
964 ret = btrfs_insert_empty_item(trans, root, path,
965 &new_key, size);
966 if (ret)
967 goto out;
968
969 if (skip) {
970 u32 start =
971 btrfs_file_extent_calc_inline_size(0);
972 memmove(buf+start, buf+start+skip,
973 datal);
974 }
975
976 leaf = path->nodes[0];
977 slot = path->slots[0];
978 write_extent_buffer(leaf, buf,
979 btrfs_item_ptr_offset(leaf, slot),
980 size);
981 inode_add_bytes(inode, datal);
982 }
983
984 btrfs_mark_buffer_dirty(leaf);
985 }
986
987next:
988 btrfs_release_path(root, path);
989 key.offset++;
990 }
991 ret = 0;
992out:
993 btrfs_release_path(root, path);
994 if (ret == 0) {
995 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
996 if (destoff + olen > inode->i_size)
997 btrfs_i_size_write(inode, destoff + olen);
998 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
999 ret = btrfs_update_inode(trans, root, inode);
1000 }
1001 btrfs_end_transaction(trans, root);
1002 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1003 if (ret)
1004 vmtruncate(inode, 0);
1005out_unlock:
1006 mutex_unlock(&src->i_mutex);
1007 mutex_unlock(&inode->i_mutex);
1008 vfree(buf);
1009 btrfs_free_path(path);
1010out_fput:
1011 fput(src_file);
1012out_drop_write:
1013 mnt_drop_write(file->f_path.mnt);
1014 return ret;
1015}
1016
1017static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
1018{
1019 struct btrfs_ioctl_clone_range_args args;
1020
1021 if (copy_from_user(&args, argp, sizeof(args)))
1022 return -EFAULT;
1023 return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
1024 args.src_length, args.dest_offset);
1025}
1026
1027/*
1028 * there are many ways the trans_start and trans_end ioctls can lead
1029 * to deadlocks. They should only be used by applications that
1030 * basically own the machine, and have a very in depth understanding
1031 * of all the possible deadlocks and enospc problems.
1032 */
1033static long btrfs_ioctl_trans_start(struct file *file)
1034{
1035 struct inode *inode = fdentry(file)->d_inode;
1036 struct btrfs_root *root = BTRFS_I(inode)->root;
1037 struct btrfs_trans_handle *trans;
1038 int ret = 0;
1039
1040 if (!capable(CAP_SYS_ADMIN))
1041 return -EPERM;
1042
1043 if (file->private_data) {
1044 ret = -EINPROGRESS;
1045 goto out;
1046 }
1047
1048 ret = mnt_want_write(file->f_path.mnt);
1049 if (ret)
1050 goto out;
1051
1052 mutex_lock(&root->fs_info->trans_mutex);
1053 root->fs_info->open_ioctl_trans++;
1054 mutex_unlock(&root->fs_info->trans_mutex);
1055
1056 trans = btrfs_start_ioctl_transaction(root, 0);
1057 if (trans)
1058 file->private_data = trans;
1059 else
1060 ret = -ENOMEM;
1061 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
1062out:
1063 return ret;
1064}
1065
1066/*
1067 * there are many ways the trans_start and trans_end ioctls can lead
1068 * to deadlocks. They should only be used by applications that
1069 * basically own the machine, and have a very in depth understanding
1070 * of all the possible deadlocks and enospc problems.
1071 */
1072long btrfs_ioctl_trans_end(struct file *file)
1073{
1074 struct inode *inode = fdentry(file)->d_inode;
1075 struct btrfs_root *root = BTRFS_I(inode)->root;
1076 struct btrfs_trans_handle *trans;
1077 int ret = 0;
1078
1079 trans = file->private_data;
1080 if (!trans) {
1081 ret = -EINVAL;
1082 goto out;
1083 }
1084 btrfs_end_transaction(trans, root);
1085 file->private_data = NULL;
1086
1087 mutex_lock(&root->fs_info->trans_mutex);
1088 root->fs_info->open_ioctl_trans--;
1089 mutex_unlock(&root->fs_info->trans_mutex);
1090
1091 mnt_drop_write(file->f_path.mnt);
1092
1093out:
1094 return ret;
1095}
1096
1097long btrfs_ioctl(struct file *file, unsigned int
1098 cmd, unsigned long arg)
1099{
1100 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1101 void __user *argp = (void __user *)arg;
1102
1103 switch (cmd) {
1104 case BTRFS_IOC_SNAP_CREATE:
1105 return btrfs_ioctl_snap_create(file, argp, 0);
1106 case BTRFS_IOC_SUBVOL_CREATE:
1107 return btrfs_ioctl_snap_create(file, argp, 1);
1108 case BTRFS_IOC_DEFRAG:
1109 return btrfs_ioctl_defrag(file);
1110 case BTRFS_IOC_RESIZE:
1111 return btrfs_ioctl_resize(root, argp);
1112 case BTRFS_IOC_ADD_DEV:
1113 return btrfs_ioctl_add_dev(root, argp);
1114 case BTRFS_IOC_RM_DEV:
1115 return btrfs_ioctl_rm_dev(root, argp);
1116 case BTRFS_IOC_BALANCE:
1117 return btrfs_balance(root->fs_info->dev_root);
1118 case BTRFS_IOC_CLONE:
1119 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
1120 case BTRFS_IOC_CLONE_RANGE:
1121 return btrfs_ioctl_clone_range(file, argp);
1122 case BTRFS_IOC_TRANS_START:
1123 return btrfs_ioctl_trans_start(file);
1124 case BTRFS_IOC_TRANS_END:
1125 return btrfs_ioctl_trans_end(file);
1126 case BTRFS_IOC_SYNC:
1127 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1128 return 0;
1129 }
1130
1131 return -ENOTTY;
1132}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..b320b103fa13
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,69 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 4087
26
27/* this should be 4k */
28struct btrfs_ioctl_vol_args {
29 __s64 fd;
30 char name[BTRFS_PATH_NAME_MAX + 1];
31};
32
33struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd;
35 __u64 src_offset, src_length;
36 __u64 dest_offset;
37};
38
/* ioctl numbers, in command-number order (5 is not assigned here) */
#define BTRFS_IOC_SNAP_CREATE	_IOW(BTRFS_IOCTL_MAGIC, 1, \
				     struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_DEFRAG	_IOW(BTRFS_IOCTL_MAGIC, 2, \
				     struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_RESIZE	_IOW(BTRFS_IOCTL_MAGIC, 3, \
				     struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SCAN_DEV	_IOW(BTRFS_IOCTL_MAGIC, 4, \
				     struct btrfs_ioctl_vol_args)

/*
 * trans start and trans end are dangerous, and only for
 * use by applications that know how to avoid the
 * resulting deadlocks
 */
#define BTRFS_IOC_TRANS_START	_IO(BTRFS_IOCTL_MAGIC, 6)
#define BTRFS_IOC_TRANS_END	_IO(BTRFS_IOCTL_MAGIC, 7)
#define BTRFS_IOC_SYNC		_IO(BTRFS_IOCTL_MAGIC, 8)

#define BTRFS_IOC_CLONE		_IOW(BTRFS_IOCTL_MAGIC, 9, int)
#define BTRFS_IOC_ADD_DEV	_IOW(BTRFS_IOCTL_MAGIC, 10, \
				     struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_RM_DEV	_IOW(BTRFS_IOCTL_MAGIC, 11, \
				     struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_BALANCE	_IOW(BTRFS_IOCTL_MAGIC, 12, \
				     struct btrfs_ioctl_vol_args)

#define BTRFS_IOC_CLONE_RANGE	_IOW(BTRFS_IOCTL_MAGIC, 13, \
				     struct btrfs_ioctl_clone_range_args)

#define BTRFS_IOC_SUBVOL_CREATE	_IOW(BTRFS_IOCTL_MAGIC, 14, \
				     struct btrfs_ioctl_vol_args)
68
69#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..39bae7761db6
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,88 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h>
21#include <linux/spinlock.h>
22#include <linux/page-flags.h>
23#include <asm/bug.h>
24#include "ctree.h"
25#include "extent_io.h"
26#include "locking.h"
27
28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks
30 * and the spin is not tuned very extensively. The spinning does make a big
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */
37
38int btrfs_tree_lock(struct extent_buffer *eb)
39{
40 int i;
41
42 if (mutex_trylock(&eb->mutex))
43 return 0;
44 for (i = 0; i < 512; i++) {
45 cpu_relax();
46 if (mutex_trylock(&eb->mutex))
47 return 0;
48 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0;
52}
53
54int btrfs_try_tree_lock(struct extent_buffer *eb)
55{
56 return mutex_trylock(&eb->mutex);
57}
58
59int btrfs_tree_unlock(struct extent_buffer *eb)
60{
61 mutex_unlock(&eb->mutex);
62 return 0;
63}
64
65int btrfs_tree_locked(struct extent_buffer *eb)
66{
67 return mutex_is_locked(&eb->mutex);
68}
69
70/*
71 * btrfs_search_slot uses this to decide if it should drop its locks
72 * before doing something expensive like allocating free blocks for cow.
73 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
75{
76 int i;
77 struct extent_buffer *eb;
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
79 eb = path->nodes[i];
80 if (!eb)
81 break;
82 smp_mb();
83 if (!list_empty(&eb->mutex.wait_list))
84 return 1;
85 }
86 return 0;
87}
88
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
#ifndef __BTRFS_LOCKING_
#define __BTRFS_LOCKING_

/*
 * Only pointers to these types appear below, so forward declarations are
 * enough and let this header be included without pulling in ctree.h or
 * extent_io.h first.
 */
struct extent_buffer;
struct btrfs_path;

/* helpers around the per-buffer mutex embedded in struct extent_buffer */
int btrfs_tree_lock(struct extent_buffer *eb);
int btrfs_tree_unlock(struct extent_buffer *eb);
int btrfs_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_lock(struct extent_buffer *eb);
int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..a20940170274
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,730 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/gfp.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22#include <linux/writeback.h>
23#include <linux/pagevec.h>
24#include "ctree.h"
25#include "transaction.h"
26#include "btrfs_inode.h"
27#include "extent_io.h"
28
29static u64 entry_end(struct btrfs_ordered_extent *entry)
30{
31 if (entry->file_offset + entry->len < entry->file_offset)
32 return (u64)-1;
33 return entry->file_offset + entry->len;
34}
35
36/* returns NULL if the insertion worked, or it returns the node it did find
37 * in the tree
38 */
39static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
40 struct rb_node *node)
41{
42 struct rb_node **p = &root->rb_node;
43 struct rb_node *parent = NULL;
44 struct btrfs_ordered_extent *entry;
45
46 while (*p) {
47 parent = *p;
48 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
49
50 if (file_offset < entry->file_offset)
51 p = &(*p)->rb_left;
52 else if (file_offset >= entry_end(entry))
53 p = &(*p)->rb_right;
54 else
55 return parent;
56 }
57
58 rb_link_node(node, parent, p);
59 rb_insert_color(node, root);
60 return NULL;
61}
62
63/*
64 * look for a given offset in the tree, and if it can't be found return the
65 * first lesser offset
66 */
67static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
68 struct rb_node **prev_ret)
69{
70 struct rb_node *n = root->rb_node;
71 struct rb_node *prev = NULL;
72 struct rb_node *test;
73 struct btrfs_ordered_extent *entry;
74 struct btrfs_ordered_extent *prev_entry = NULL;
75
76 while (n) {
77 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
78 prev = n;
79 prev_entry = entry;
80
81 if (file_offset < entry->file_offset)
82 n = n->rb_left;
83 else if (file_offset >= entry_end(entry))
84 n = n->rb_right;
85 else
86 return n;
87 }
88 if (!prev_ret)
89 return NULL;
90
91 while (prev && file_offset >= entry_end(prev_entry)) {
92 test = rb_next(prev);
93 if (!test)
94 break;
95 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
96 rb_node);
97 if (file_offset < entry_end(prev_entry))
98 break;
99
100 prev = test;
101 }
102 if (prev)
103 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
104 rb_node);
105 while (prev && file_offset < entry_end(prev_entry)) {
106 test = rb_prev(prev);
107 if (!test)
108 break;
109 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
110 rb_node);
111 prev = test;
112 }
113 *prev_ret = prev;
114 return NULL;
115}
116
117/*
118 * helper to check if a given offset is inside a given entry
119 */
120static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
121{
122 if (file_offset < entry->file_offset ||
123 entry->file_offset + entry->len <= file_offset)
124 return 0;
125 return 1;
126}
127
128/*
129 * look find the first ordered struct that has this offset, otherwise
130 * the first one less than this offset
131 */
132static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
133 u64 file_offset)
134{
135 struct rb_root *root = &tree->tree;
136 struct rb_node *prev;
137 struct rb_node *ret;
138 struct btrfs_ordered_extent *entry;
139
140 if (tree->last) {
141 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
142 rb_node);
143 if (offset_in_entry(entry, file_offset))
144 return tree->last;
145 }
146 ret = __tree_search(root, file_offset, &prev);
147 if (!ret)
148 ret = prev;
149 if (ret)
150 tree->last = ret;
151 return ret;
152}
153
154/* allocate and add a new ordered_extent into the per-inode tree.
155 * file_offset is the logical offset in the file
156 *
157 * start is the disk block number of an extent already reserved in the
158 * extent allocation tree
159 *
160 * len is the length of the extent
161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was
165 * inserted.
166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, u64 disk_len, int type)
169{
170 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node;
172 struct btrfs_ordered_extent *entry;
173
174 tree = &BTRFS_I(inode)->ordered_tree;
175 entry = kzalloc(sizeof(*entry), GFP_NOFS);
176 if (!entry)
177 return -ENOMEM;
178
179 mutex_lock(&tree->mutex);
180 entry->file_offset = file_offset;
181 entry->start = start;
182 entry->len = len;
183 entry->disk_len = disk_len;
184 entry->inode = inode;
185 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186 set_bit(type, &entry->flags);
187
188 /* one ref for the tree */
189 atomic_set(&entry->refs, 1);
190 init_waitqueue_head(&entry->wait);
191 INIT_LIST_HEAD(&entry->list);
192 INIT_LIST_HEAD(&entry->root_extent_list);
193
194 node = tree_insert(&tree->tree, file_offset,
195 &entry->rb_node);
196 BUG_ON(node);
197
198 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
199 entry_end(entry) - 1, GFP_NOFS);
200
201 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
202 list_add_tail(&entry->root_extent_list,
203 &BTRFS_I(inode)->root->fs_info->ordered_extents);
204 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
205
206 mutex_unlock(&tree->mutex);
207 BUG_ON(node);
208 return 0;
209}
210
211/*
212 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
213 * when an ordered extent is finished. If the list covers more than one
214 * ordered extent, it is split across multiples.
215 */
216int btrfs_add_ordered_sum(struct inode *inode,
217 struct btrfs_ordered_extent *entry,
218 struct btrfs_ordered_sum *sum)
219{
220 struct btrfs_ordered_inode_tree *tree;
221
222 tree = &BTRFS_I(inode)->ordered_tree;
223 mutex_lock(&tree->mutex);
224 list_add_tail(&sum->list, &entry->list);
225 mutex_unlock(&tree->mutex);
226 return 0;
227}
228
229/*
230 * this is used to account for finished IO across a given range
231 * of the file. The IO should not span ordered extents. If
232 * a given ordered_extent is completely done, 1 is returned, otherwise
233 * 0.
234 *
235 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
236 * to make sure this function only returns 1 once for a given ordered extent.
237 */
238int btrfs_dec_test_ordered_pending(struct inode *inode,
239 u64 file_offset, u64 io_size)
240{
241 struct btrfs_ordered_inode_tree *tree;
242 struct rb_node *node;
243 struct btrfs_ordered_extent *entry;
244 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
245 int ret;
246
247 tree = &BTRFS_I(inode)->ordered_tree;
248 mutex_lock(&tree->mutex);
249 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
250 GFP_NOFS);
251 node = tree_search(tree, file_offset);
252 if (!node) {
253 ret = 1;
254 goto out;
255 }
256
257 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
258 if (!offset_in_entry(entry, file_offset)) {
259 ret = 1;
260 goto out;
261 }
262
263 ret = test_range_bit(io_tree, entry->file_offset,
264 entry->file_offset + entry->len - 1,
265 EXTENT_ORDERED, 0);
266 if (ret == 0)
267 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
268out:
269 mutex_unlock(&tree->mutex);
270 return ret == 0;
271}
272
273/*
274 * used to drop a reference on an ordered extent. This will free
275 * the extent if the last reference is dropped
276 */
277int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
278{
279 struct list_head *cur;
280 struct btrfs_ordered_sum *sum;
281
282 if (atomic_dec_and_test(&entry->refs)) {
283 while (!list_empty(&entry->list)) {
284 cur = entry->list.next;
285 sum = list_entry(cur, struct btrfs_ordered_sum, list);
286 list_del(&sum->list);
287 kfree(sum);
288 }
289 kfree(entry);
290 }
291 return 0;
292}
293
294/*
295 * remove an ordered extent from the tree. No references are dropped
296 * but, anyone waiting on this extent is woken up.
297 */
298int btrfs_remove_ordered_extent(struct inode *inode,
299 struct btrfs_ordered_extent *entry)
300{
301 struct btrfs_ordered_inode_tree *tree;
302 struct rb_node *node;
303
304 tree = &BTRFS_I(inode)->ordered_tree;
305 mutex_lock(&tree->mutex);
306 node = &entry->rb_node;
307 rb_erase(node, &tree->tree);
308 tree->last = NULL;
309 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list);
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314
315 mutex_unlock(&tree->mutex);
316 wake_up(&entry->wait);
317 return 0;
318}
319
320/*
321 * wait for all the ordered extents in a root. This is done when balancing
322 * space between drives.
323 */
324int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
325{
326 struct list_head splice;
327 struct list_head *cur;
328 struct btrfs_ordered_extent *ordered;
329 struct inode *inode;
330
331 INIT_LIST_HEAD(&splice);
332
333 spin_lock(&root->fs_info->ordered_extent_lock);
334 list_splice_init(&root->fs_info->ordered_extents, &splice);
335 while (!list_empty(&splice)) {
336 cur = splice.next;
337 ordered = list_entry(cur, struct btrfs_ordered_extent,
338 root_extent_list);
339 if (nocow_only &&
340 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
341 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
342 list_move(&ordered->root_extent_list,
343 &root->fs_info->ordered_extents);
344 cond_resched_lock(&root->fs_info->ordered_extent_lock);
345 continue;
346 }
347
348 list_del_init(&ordered->root_extent_list);
349 atomic_inc(&ordered->refs);
350
351 /*
352 * the inode may be getting freed (in sys_unlink path).
353 */
354 inode = igrab(ordered->inode);
355
356 spin_unlock(&root->fs_info->ordered_extent_lock);
357
358 if (inode) {
359 btrfs_start_ordered_extent(inode, ordered, 1);
360 btrfs_put_ordered_extent(ordered);
361 iput(inode);
362 } else {
363 btrfs_put_ordered_extent(ordered);
364 }
365
366 spin_lock(&root->fs_info->ordered_extent_lock);
367 }
368 spin_unlock(&root->fs_info->ordered_extent_lock);
369 return 0;
370}
371
372/*
373 * Used to start IO or wait for a given ordered extent to finish.
374 *
375 * If wait is one, this effectively waits on page writeback for all the pages
376 * in the extent, and it waits on the io completion code to insert
377 * metadata into the btree corresponding to the extent
378 */
379void btrfs_start_ordered_extent(struct inode *inode,
380 struct btrfs_ordered_extent *entry,
381 int wait)
382{
383 u64 start = entry->file_offset;
384 u64 end = start + entry->len - 1;
385
386 /*
387 * pages in the range can be dirty, clean or writeback. We
388 * start IO on any dirty ones so the wait doesn't stall waiting
389 * for pdflush to find them
390 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
392 if (wait) {
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags));
395 }
396}
397
398/*
399 * Used to wait on ordered extents across a large range of bytes.
400 */
401int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
402{
403 u64 end;
404 u64 orig_end;
405 u64 wait_end;
406 struct btrfs_ordered_extent *ordered;
407
408 if (start + len < start) {
409 orig_end = INT_LIMIT(loff_t);
410 } else {
411 orig_end = start + len - 1;
412 if (orig_end > INT_LIMIT(loff_t))
413 orig_end = INT_LIMIT(loff_t);
414 }
415 wait_end = orig_end;
416again:
417 /* start IO across the range first to instantiate any delalloc
418 * extents
419 */
420 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
421
422 /* The compression code will leave pages locked but return from
423 * writepage without setting the page writeback. Starting again
424 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
425 */
426 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
427
428 btrfs_wait_on_page_writeback_range(inode->i_mapping,
429 start >> PAGE_CACHE_SHIFT,
430 orig_end >> PAGE_CACHE_SHIFT);
431
432 end = orig_end;
433 while (1) {
434 ordered = btrfs_lookup_first_ordered_extent(inode, end);
435 if (!ordered)
436 break;
437 if (ordered->file_offset > orig_end) {
438 btrfs_put_ordered_extent(ordered);
439 break;
440 }
441 if (ordered->file_offset + ordered->len < start) {
442 btrfs_put_ordered_extent(ordered);
443 break;
444 }
445 btrfs_start_ordered_extent(inode, ordered, 1);
446 end = ordered->file_offset;
447 btrfs_put_ordered_extent(ordered);
448 if (end == 0 || end == start)
449 break;
450 end--;
451 }
452 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
453 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
454 schedule_timeout(1);
455 goto again;
456 }
457 return 0;
458}
459
460/*
461 * find an ordered extent corresponding to file_offset. return NULL if
462 * nothing is found, otherwise take a reference on the extent and return it
463 */
464struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
465 u64 file_offset)
466{
467 struct btrfs_ordered_inode_tree *tree;
468 struct rb_node *node;
469 struct btrfs_ordered_extent *entry = NULL;
470
471 tree = &BTRFS_I(inode)->ordered_tree;
472 mutex_lock(&tree->mutex);
473 node = tree_search(tree, file_offset);
474 if (!node)
475 goto out;
476
477 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
478 if (!offset_in_entry(entry, file_offset))
479 entry = NULL;
480 if (entry)
481 atomic_inc(&entry->refs);
482out:
483 mutex_unlock(&tree->mutex);
484 return entry;
485}
486
487/*
488 * lookup and return any extent before 'file_offset'. NULL is returned
489 * if none is found
490 */
491struct btrfs_ordered_extent *
492btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
493{
494 struct btrfs_ordered_inode_tree *tree;
495 struct rb_node *node;
496 struct btrfs_ordered_extent *entry = NULL;
497
498 tree = &BTRFS_I(inode)->ordered_tree;
499 mutex_lock(&tree->mutex);
500 node = tree_search(tree, file_offset);
501 if (!node)
502 goto out;
503
504 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
505 atomic_inc(&entry->refs);
506out:
507 mutex_unlock(&tree->mutex);
508 return entry;
509}
510
511/*
512 * After an extent is done, call this to conditionally update the on disk
513 * i_size. i_size is updated to cover any fully written part of the file.
514 */
515int btrfs_ordered_update_i_size(struct inode *inode,
516 struct btrfs_ordered_extent *ordered)
517{
518 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
519 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
520 u64 disk_i_size;
521 u64 new_i_size;
522 u64 i_size_test;
523 struct rb_node *node;
524 struct btrfs_ordered_extent *test;
525
526 mutex_lock(&tree->mutex);
527 disk_i_size = BTRFS_I(inode)->disk_i_size;
528
529 /*
530 * if the disk i_size is already at the inode->i_size, or
531 * this ordered extent is inside the disk i_size, we're done
532 */
533 if (disk_i_size >= inode->i_size ||
534 ordered->file_offset + ordered->len <= disk_i_size) {
535 goto out;
536 }
537
538 /*
539 * we can't update the disk_isize if there are delalloc bytes
540 * between disk_i_size and this ordered extent
541 */
542 if (test_range_bit(io_tree, disk_i_size,
543 ordered->file_offset + ordered->len - 1,
544 EXTENT_DELALLOC, 0)) {
545 goto out;
546 }
547 /*
548 * walk backward from this ordered extent to disk_i_size.
549 * if we find an ordered extent then we can't update disk i_size
550 * yet
551 */
552 node = &ordered->rb_node;
553 while (1) {
554 node = rb_prev(node);
555 if (!node)
556 break;
557 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
558 if (test->file_offset + test->len <= disk_i_size)
559 break;
560 if (test->file_offset >= inode->i_size)
561 break;
562 if (test->file_offset >= disk_i_size)
563 goto out;
564 }
565 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
566
567 /*
568 * at this point, we know we can safely update i_size to at least
569 * the offset from this ordered extent. But, we need to
570 * walk forward and see if ios from higher up in the file have
571 * finished.
572 */
573 node = rb_next(&ordered->rb_node);
574 i_size_test = 0;
575 if (node) {
576 /*
577 * do we have an area where IO might have finished
578 * between our ordered extent and the next one.
579 */
580 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
581 if (test->file_offset > entry_end(ordered))
582 i_size_test = test->file_offset;
583 } else {
584 i_size_test = i_size_read(inode);
585 }
586
587 /*
588 * i_size_test is the end of a region after this ordered
589 * extent where there are no ordered extents. As long as there
590 * are no delalloc bytes in this area, it is safe to update
591 * disk_i_size to the end of the region.
592 */
593 if (i_size_test > entry_end(ordered) &&
594 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
595 EXTENT_DELALLOC, 0)) {
596 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
597 }
598 BTRFS_I(inode)->disk_i_size = new_i_size;
599out:
600 mutex_unlock(&tree->mutex);
601 return 0;
602}
603
604/*
605 * search the ordered extents for one corresponding to 'offset' and
606 * try to find a checksum. This is used because we allow pages to
607 * be reclaimed before their checksum is actually put into the btree
608 */
609int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
610 u32 *sum)
611{
612 struct btrfs_ordered_sum *ordered_sum;
613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors;
618 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
620 int ret = 1;
621
622 ordered = btrfs_lookup_ordered_extent(inode, offset);
623 if (!ordered)
624 return 1;
625
626 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums;
632 for (i = 0; i < num_sectors; i++) {
633 if (sector_sums[i].bytenr == disk_bytenr) {
634 *sum = sector_sums[i].sum;
635 ret = 0;
636 goto out;
637 }
638 }
639 }
640 }
641out:
642 mutex_unlock(&tree->mutex);
643 btrfs_put_ordered_extent(ordered);
644 return ret;
645}
646
647
648/**
649 * taken from mm/filemap.c because it isn't exported
650 *
651 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
652 * @mapping: address space structure to write
653 * @start: offset in bytes where the range starts
654 * @end: offset in bytes where the range ends (inclusive)
655 * @sync_mode: enable synchronous operation
656 *
657 * Start writeback against all of a mapping's dirty pages that lie
658 * within the byte offsets <start, end> inclusive.
659 *
660 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
661 * opposed to a regular memory cleansing writeback. The difference between
662 * these two operations is that if a dirty page/buffer is encountered, it must
663 * be waited upon, and not just skipped over.
664 */
665int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
666 loff_t end, int sync_mode)
667{
668 struct writeback_control wbc = {
669 .sync_mode = sync_mode,
670 .nr_to_write = mapping->nrpages * 2,
671 .range_start = start,
672 .range_end = end,
673 .for_writepages = 1,
674 };
675 return btrfs_writepages(mapping, &wbc);
676}
677
678/**
679 * taken from mm/filemap.c because it isn't exported
680 *
681 * wait_on_page_writeback_range - wait for writeback to complete
682 * @mapping: target address_space
683 * @start: beginning page index
684 * @end: ending page index
685 *
686 * Wait for writeback to complete against pages indexed by start->end
687 * inclusive
688 */
689int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
690 pgoff_t start, pgoff_t end)
691{
692 struct pagevec pvec;
693 int nr_pages;
694 int ret = 0;
695 pgoff_t index;
696
697 if (end < start)
698 return 0;
699
700 pagevec_init(&pvec, 0);
701 index = start;
702 while ((index <= end) &&
703 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
704 PAGECACHE_TAG_WRITEBACK,
705 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
706 unsigned i;
707
708 for (i = 0; i < nr_pages; i++) {
709 struct page *page = pvec.pages[i];
710
711 /* until radix tree lookup accepts end_index */
712 if (page->index > end)
713 continue;
714
715 wait_on_page_writeback(page);
716 if (PageError(page))
717 ret = -EIO;
718 }
719 pagevec_release(&pvec);
720 cond_resched();
721 }
722
723 /* Check for outstanding write errors */
724 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
725 ret = -ENOSPC;
726 if (test_and_clear_bit(AS_EIO, &mapping->flags))
727 ret = -EIO;
728
729 return ret;
730}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..ab66d5e8d6d6
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,158 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ORDERED_DATA__
20#define __BTRFS_ORDERED_DATA__
21
22/* one of these per inode; set up with btrfs_ordered_inode_tree_init() */
23struct btrfs_ordered_inode_tree {
/* initialised by btrfs_ordered_inode_tree_init() */
24 struct mutex mutex;
/* rbtree root; starts out empty (rb_node == NULL) */
25 struct rb_root tree;
/* starts out NULL; NOTE(review): presumably a cached node from the most recent lookup -- confirm in ordered-data.c */
26 struct rb_node *last;
27};
28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
 *
 * One struct btrfs_sector_sum pairs a disk byte number with a 32-bit sum
 * for it; they are stored in the sums[] array of struct btrfs_ordered_sum.
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
/* checksum value recorded for that bytenr */
38 u32 sum;
39};
40
/*
 * A run of per-sector checksums.  The structure is variable length: use
 * btrfs_ordered_sum_size() to compute how much to allocate for a given
 * number of bytes.
 */
41struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */
43 u64 bytenr;
44
45 /*
46 * this is the length in bytes covered by the sums array below.
47 */
48 unsigned long len;
/* list linkage; NOTE(review): presumably chains this record onto btrfs_ordered_extent.list -- confirm */
49 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */
51 struct btrfs_sector_sum sums[];
52};
53
54/*
55 * bits for the flags field:
56 *
57 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
58 * It is used to make sure metadata is inserted into the tree only once
59 * per extent.
60 *
61 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
62 * rbtree, just before waking any waiters. It is used to indicate the
63 * IO is done and any metadata is inserted into the tree.
 *
 * The values below are bit positions within btrfs_ordered_extent.flags
 * (an unsigned long), not masks.
64 */
65#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
66
67#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74
/*
 * One ordered extent: a range of a file, starting at file_offset and len
 * bytes long, together with the disk location it is written to.  It is
 * reference counted through refs and released with
 * btrfs_put_ordered_extent().
 */
75struct btrfs_ordered_extent {
76 /* logical offset in the file */
77 u64 file_offset;
78
79 /* disk byte number */
80 u64 start;
81
82 /* ram length of the extent in bytes */
83 u64 len;
84
85 /* extent length on disk */
86 u64 disk_len;
87
88 /* flags (described above) */
89 unsigned long flags;
90
91 /* reference count */
92 atomic_t refs;
93
94 /* the inode we belong to */
95 struct inode *inode;
96
97 /* list of checksums for insertion when the extent io is done */
98 struct list_head list;
99
100 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
101 wait_queue_head_t wait;
102
103 /* our friendly rbtree entry */
104 struct rb_node rb_node;
105
106 /* a per root list of all the pending ordered extents */
107 struct list_head root_extent_list;
108};
109
110
111/*
112 * calculates the total size you need to allocate for an ordered sum
113 * structure spanning 'bytes' in the file
114 */
115static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
116 unsigned long bytes)
117{
118 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
119 root->sectorsize;
120 num_sectors++;
121 return sizeof(struct btrfs_ordered_sum) +
122 num_sectors * sizeof(struct btrfs_sector_sum);
123}
124
125static inline void
126btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
127{
128 mutex_init(&t->mutex);
129 t->tree.rb_node = NULL;
130 t->last = NULL;
131}
132
133int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
134int btrfs_remove_ordered_extent(struct inode *inode,
135 struct btrfs_ordered_extent *entry);
136int btrfs_dec_test_ordered_pending(struct inode *inode,
137 u64 file_offset, u64 io_size);
138int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
139 u64 start, u64 len, u64 disk_len, int tyep);
140int btrfs_add_ordered_sum(struct inode *inode,
141 struct btrfs_ordered_extent *entry,
142 struct btrfs_ordered_sum *sum);
143struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
144 u64 file_offset);
145void btrfs_start_ordered_extent(struct inode *inode,
146 struct btrfs_ordered_extent *entry, int wait);
147int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
148struct btrfs_ordered_extent *
149btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
150int btrfs_ordered_update_i_size(struct inode *inode,
151 struct btrfs_ordered_extent *ordered);
152int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
153int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
154 pgoff_t start, pgoff_t end);
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21
22int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root, u64 offset)
24{
25 struct btrfs_path *path;
26 struct btrfs_key key;
27 int ret = 0;
28
29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
31 key.offset = offset;
32
33 path = btrfs_alloc_path();
34 if (!path)
35 return -ENOMEM;
36
37 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
38
39 btrfs_free_path(path);
40 return ret;
41}
42
43int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root, u64 offset)
45{
46 struct btrfs_path *path;
47 struct btrfs_key key;
48 int ret = 0;
49
50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
52 key.offset = offset;
53
54 path = btrfs_alloc_path();
55 if (!path)
56 return -ENOMEM;
57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret)
60 goto out;
61
62 ret = btrfs_del_item(trans, root, path);
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..5f8f218c1005
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,216 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "print-tree.h"
22
23static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
24{
25 int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
26 int i;
27 printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
28 "num_stripes %d\n",
29 (unsigned long long)btrfs_chunk_length(eb, chunk),
30 (unsigned long long)btrfs_chunk_owner(eb, chunk),
31 (unsigned long long)btrfs_chunk_type(eb, chunk),
32 num_stripes);
33 for (i = 0 ; i < num_stripes ; i++) {
34 printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
35 (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
36 (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
37 }
38}
39static void print_dev_item(struct extent_buffer *eb,
40 struct btrfs_dev_item *dev_item)
41{
42 printk(KERN_INFO "\t\tdev item devid %llu "
43 "total_bytes %llu bytes used %llu\n",
44 (unsigned long long)btrfs_device_id(eb, dev_item),
45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
47}
48void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
49{
50 int i;
51 u32 nr = btrfs_header_nritems(l);
52 struct btrfs_item *item;
53 struct btrfs_extent_item *ei;
54 struct btrfs_root_item *ri;
55 struct btrfs_dir_item *di;
56 struct btrfs_inode_item *ii;
57 struct btrfs_block_group_item *bi;
58 struct btrfs_file_extent_item *fi;
59 struct btrfs_key key;
60 struct btrfs_key found_key;
61 struct btrfs_extent_ref *ref;
62 struct btrfs_dev_extent *dev_extent;
63 u32 type;
64
65 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
66 (unsigned long long)btrfs_header_bytenr(l), nr,
67 btrfs_leaf_free_space(root, l));
68 for (i = 0 ; i < nr ; i++) {
69 item = btrfs_item_nr(l, i);
70 btrfs_item_key_to_cpu(l, &key, i);
71 type = btrfs_key_type(&key);
72 printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
73 "itemsize %d\n",
74 i,
75 (unsigned long long)key.objectid, type,
76 (unsigned long long)key.offset,
77 btrfs_item_offset(l, item), btrfs_item_size(l, item));
78 switch (type) {
79 case BTRFS_INODE_ITEM_KEY:
80 ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
81 printk(KERN_INFO "\t\tinode generation %llu size %llu "
82 "mode %o\n",
83 (unsigned long long)
84 btrfs_inode_generation(l, ii),
85 (unsigned long long)btrfs_inode_size(l, ii),
86 btrfs_inode_mode(l, ii));
87 break;
88 case BTRFS_DIR_ITEM_KEY:
89 di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
90 btrfs_dir_item_key_to_cpu(l, di, &found_key);
91 printk(KERN_INFO "\t\tdir oid %llu type %u\n",
92 (unsigned long long)found_key.objectid,
93 btrfs_dir_type(l, di));
94 break;
95 case BTRFS_ROOT_ITEM_KEY:
96 ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
97 printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
98 (unsigned long long)
99 btrfs_disk_root_bytenr(l, ri),
100 btrfs_disk_root_refs(l, ri));
101 break;
102 case BTRFS_EXTENT_ITEM_KEY:
103 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
104 printk(KERN_INFO "\t\textent data refs %u\n",
105 btrfs_extent_refs(l, ei));
106 break;
107 case BTRFS_EXTENT_REF_KEY:
108 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
109 printk(KERN_INFO "\t\textent back ref root %llu "
110 "gen %llu owner %llu num_refs %lu\n",
111 (unsigned long long)btrfs_ref_root(l, ref),
112 (unsigned long long)btrfs_ref_generation(l, ref),
113 (unsigned long long)btrfs_ref_objectid(l, ref),
114 (unsigned long)btrfs_ref_num_refs(l, ref));
115 break;
116
117 case BTRFS_EXTENT_DATA_KEY:
118 fi = btrfs_item_ptr(l, i,
119 struct btrfs_file_extent_item);
120 if (btrfs_file_extent_type(l, fi) ==
121 BTRFS_FILE_EXTENT_INLINE) {
122 printk(KERN_INFO "\t\tinline extent data "
123 "size %u\n",
124 btrfs_file_extent_inline_len(l, fi));
125 break;
126 }
127 printk(KERN_INFO "\t\textent data disk bytenr %llu "
128 "nr %llu\n",
129 (unsigned long long)
130 btrfs_file_extent_disk_bytenr(l, fi),
131 (unsigned long long)
132 btrfs_file_extent_disk_num_bytes(l, fi));
133 printk(KERN_INFO "\t\textent data offset %llu "
134 "nr %llu ram %llu\n",
135 (unsigned long long)
136 btrfs_file_extent_offset(l, fi),
137 (unsigned long long)
138 btrfs_file_extent_num_bytes(l, fi),
139 (unsigned long long)
140 btrfs_file_extent_ram_bytes(l, fi));
141 break;
142 case BTRFS_BLOCK_GROUP_ITEM_KEY:
143 bi = btrfs_item_ptr(l, i,
144 struct btrfs_block_group_item);
145 printk(KERN_INFO "\t\tblock group used %llu\n",
146 (unsigned long long)
147 btrfs_disk_block_group_used(l, bi));
148 break;
149 case BTRFS_CHUNK_ITEM_KEY:
150 print_chunk(l, btrfs_item_ptr(l, i,
151 struct btrfs_chunk));
152 break;
153 case BTRFS_DEV_ITEM_KEY:
154 print_dev_item(l, btrfs_item_ptr(l, i,
155 struct btrfs_dev_item));
156 break;
157 case BTRFS_DEV_EXTENT_KEY:
158 dev_extent = btrfs_item_ptr(l, i,
159 struct btrfs_dev_extent);
160 printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
161 "\t\tchunk objectid %llu chunk offset %llu "
162 "length %llu\n",
163 (unsigned long long)
164 btrfs_dev_extent_chunk_tree(l, dev_extent),
165 (unsigned long long)
166 btrfs_dev_extent_chunk_objectid(l, dev_extent),
167 (unsigned long long)
168 btrfs_dev_extent_chunk_offset(l, dev_extent),
169 (unsigned long long)
170 btrfs_dev_extent_length(l, dev_extent));
171 };
172 }
173}
174
175void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
176{
177 int i; u32 nr;
178 struct btrfs_key key;
179 int level;
180
181 if (!c)
182 return;
183 nr = btrfs_header_nritems(c);
184 level = btrfs_header_level(c);
185 if (level == 0) {
186 btrfs_print_leaf(root, c);
187 return;
188 }
189 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
190 (unsigned long long)btrfs_header_bytenr(c),
191 btrfs_header_level(c), nr,
192 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
193 for (i = 0; i < nr; i++) {
194 btrfs_node_key_to_cpu(c, &key, i);
195 printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
196 i,
197 (unsigned long long)key.objectid,
198 key.type,
199 (unsigned long long)key.offset,
200 (unsigned long long)btrfs_node_blockptr(c, i));
201 }
202 for (i = 0; i < nr; i++) {
203 struct extent_buffer *next = read_tree_block(root,
204 btrfs_node_blockptr(c, i),
205 btrfs_level_size(root, level - 1),
206 btrfs_node_ptr_generation(c, i));
207 if (btrfs_is_leaf(next) &&
208 btrfs_header_level(c) != 1)
209 BUG();
210 if (btrfs_header_level(next) !=
211 btrfs_header_level(c) - 1)
212 BUG();
213 btrfs_print_tree(root, next);
214 free_extent_buffer(next);
215 }
216}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __PRINT_TREE_
20#define __PRINT_TREE_
21void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
22void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
23#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..6f0acc4c9eab
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "ref-cache.h"
22#include "transaction.h"
23
24/*
25 * leaf refs are used to cache the information about which extents
26 * a given leaf has references on. This allows us to process that leaf
27 * in btrfs_drop_snapshot without needing to read it back from disk.
28 */
29
30/*
31 * kmalloc a leaf reference struct and update the counters for the
32 * total ref cache size
33 */
34struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
35 int nr_extents)
36{
37 struct btrfs_leaf_ref *ref;
38 size_t size = btrfs_leaf_ref_size(nr_extents);
39
40 ref = kmalloc(size, GFP_NOFS);
41 if (ref) {
42 spin_lock(&root->fs_info->ref_cache_lock);
43 root->fs_info->total_ref_cache_size += size;
44 spin_unlock(&root->fs_info->ref_cache_lock);
45
46 memset(ref, 0, sizeof(*ref));
47 atomic_set(&ref->usage, 1);
48 INIT_LIST_HEAD(&ref->list);
49 }
50 return ref;
51}
52
53/*
54 * free a leaf reference struct and update the counters for the
55 * total ref cache size
56 */
57void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
58{
59 if (!ref)
60 return;
61 WARN_ON(atomic_read(&ref->usage) == 0);
62 if (atomic_dec_and_test(&ref->usage)) {
63 size_t size = btrfs_leaf_ref_size(ref->nritems);
64
65 BUG_ON(ref->in_tree);
66 kfree(ref);
67
68 spin_lock(&root->fs_info->ref_cache_lock);
69 root->fs_info->total_ref_cache_size -= size;
70 spin_unlock(&root->fs_info->ref_cache_lock);
71 }
72}
73
74static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
75 struct rb_node *node)
76{
77 struct rb_node **p = &root->rb_node;
78 struct rb_node *parent = NULL;
79 struct btrfs_leaf_ref *entry;
80
81 while (*p) {
82 parent = *p;
83 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
84
85 if (bytenr < entry->bytenr)
86 p = &(*p)->rb_left;
87 else if (bytenr > entry->bytenr)
88 p = &(*p)->rb_right;
89 else
90 return parent;
91 }
92
93 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
94 rb_link_node(node, parent, p);
95 rb_insert_color(node, root);
96 return NULL;
97}
98
99static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
100{
101 struct rb_node *n = root->rb_node;
102 struct btrfs_leaf_ref *entry;
103
104 while (n) {
105 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
106 WARN_ON(!entry->in_tree);
107
108 if (bytenr < entry->bytenr)
109 n = n->rb_left;
110 else if (bytenr > entry->bytenr)
111 n = n->rb_right;
112 else
113 return n;
114 }
115 return NULL;
116}
117
118int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
119 int shared)
120{
121 struct btrfs_leaf_ref *ref = NULL;
122 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
123
124 if (shared)
125 tree = &root->fs_info->shared_ref_tree;
126 if (!tree)
127 return 0;
128
129 spin_lock(&tree->lock);
130 while (!list_empty(&tree->list)) {
131 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
132 BUG_ON(ref->tree != tree);
133 if (ref->root_gen > max_root_gen)
134 break;
135 if (!xchg(&ref->in_tree, 0)) {
136 cond_resched_lock(&tree->lock);
137 continue;
138 }
139
140 rb_erase(&ref->rb_node, &tree->root);
141 list_del_init(&ref->list);
142
143 spin_unlock(&tree->lock);
144 btrfs_free_leaf_ref(root, ref);
145 cond_resched();
146 spin_lock(&tree->lock);
147 }
148 spin_unlock(&tree->lock);
149 return 0;
150}
151
152/*
153 * find the leaf ref for a given extent. This returns the ref struct with
154 * a usage reference incremented
155 */
156struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
157 u64 bytenr)
158{
159 struct rb_node *rb;
160 struct btrfs_leaf_ref *ref = NULL;
161 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
162again:
163 if (tree) {
164 spin_lock(&tree->lock);
165 rb = tree_search(&tree->root, bytenr);
166 if (rb)
167 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
168 if (ref)
169 atomic_inc(&ref->usage);
170 spin_unlock(&tree->lock);
171 if (ref)
172 return ref;
173 }
174 if (tree != &root->fs_info->shared_ref_tree) {
175 tree = &root->fs_info->shared_ref_tree;
176 goto again;
177 }
178 return NULL;
179}
180
181/*
182 * add a fully filled in leaf ref struct
183 * remove all the refs older than a given root generation
184 */
185int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
186 int shared)
187{
188 int ret = 0;
189 struct rb_node *rb;
190 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
191
192 if (shared)
193 tree = &root->fs_info->shared_ref_tree;
194
195 spin_lock(&tree->lock);
196 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
197 if (rb) {
198 ret = -EEXIST;
199 } else {
200 atomic_inc(&ref->usage);
201 ref->tree = tree;
202 ref->in_tree = 1;
203 list_add_tail(&ref->list, &tree->list);
204 }
205 spin_unlock(&tree->lock);
206 return ret;
207}
208
209/*
210 * remove a single leaf ref from the tree. This drops the ref held by the tree
211 * only
212 */
213int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
214{
215 struct btrfs_leaf_ref_tree *tree;
216
217 if (!xchg(&ref->in_tree, 0))
218 return 0;
219
220 tree = ref->tree;
221 spin_lock(&tree->lock);
222
223 rb_erase(&ref->rb_node, &tree->root);
224 list_del_init(&ref->list);
225
226 spin_unlock(&tree->lock);
227
228 btrfs_free_leaf_ref(root, ref);
229 return 0;
230}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..16f3183d7c59
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root.rb_node = NULL;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
65void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
66struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
67 int nr_extents);
68void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
69struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
70 u64 bytenr);
71int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
72 int shared);
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..b48650de4472
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,366 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "transaction.h"
21#include "disk-io.h"
22#include "print-tree.h"
23
24/*
25 * search forward for a root, starting with objectid 'search_start'
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
74 * on error.
75 */
76int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
77 struct btrfs_root_item *item, struct btrfs_key *key)
78{
79 struct btrfs_path *path;
80 struct btrfs_key search_key;
81 struct btrfs_key found_key;
82 struct extent_buffer *l;
83 int ret;
84 int slot;
85
86 search_key.objectid = objectid;
87 search_key.type = BTRFS_ROOT_ITEM_KEY;
88 search_key.offset = (u64)-1;
89
90 path = btrfs_alloc_path();
91 BUG_ON(!path);
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0)
94 goto out;
95
96 BUG_ON(ret == 0);
97 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) {
102 ret = 1;
103 goto out;
104 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
106 sizeof(*item));
107 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0;
109out:
110 btrfs_free_path(path);
111 return ret;
112}
113
114/*
115 * copy the data in 'item' into the btree
116 */
117int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
118 *root, struct btrfs_key *key, struct btrfs_root_item
119 *item)
120{
121 struct btrfs_path *path;
122 struct extent_buffer *l;
123 int ret;
124 int slot;
125 unsigned long ptr;
126
127 path = btrfs_alloc_path();
128 BUG_ON(!path);
129 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
130 if (ret < 0)
131 goto out;
132
133 if (ret != 0) {
134 btrfs_print_leaf(root, path->nodes[0]);
135 printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
136 (unsigned long long)key->objectid, key->type,
137 (unsigned long long)key->offset);
138 BUG_ON(1);
139 }
140
141 l = path->nodes[0];
142 slot = path->slots[0];
143 ptr = btrfs_item_ptr_offset(l, slot);
144 write_extent_buffer(l, item, ptr, sizeof(*item));
145 btrfs_mark_buffer_dirty(path->nodes[0]);
146out:
147 btrfs_release_path(root, path);
148 btrfs_free_path(path);
149 return ret;
150}
151
152int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
153 *root, struct btrfs_key *key, struct btrfs_root_item
154 *item)
155{
156 int ret;
157 ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
158 return ret;
159}
160
161/*
162 * at mount time we want to find all the old transaction snapshots that were in
163 * the process of being deleted if we crashed. This is any root item with an
164 * offset lower than the latest root. They need to be queued for deletion to
165 * finish what was happening when we crashed.
166 */
167int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
168 struct btrfs_root *latest)
169{
170 struct btrfs_root *dead_root;
171 struct btrfs_item *item;
172 struct btrfs_root_item *ri;
173 struct btrfs_key key;
174 struct btrfs_key found_key;
175 struct btrfs_path *path;
176 int ret;
177 u32 nritems;
178 struct extent_buffer *leaf;
179 int slot;
180
181 key.objectid = objectid;
182 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
183 key.offset = 0;
184 path = btrfs_alloc_path();
185 if (!path)
186 return -ENOMEM;
187
188again:
189 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
190 if (ret < 0)
191 goto err;
192 while (1) {
193 leaf = path->nodes[0];
194 nritems = btrfs_header_nritems(leaf);
195 slot = path->slots[0];
196 if (slot >= nritems) {
197 ret = btrfs_next_leaf(root, path);
198 if (ret)
199 break;
200 leaf = path->nodes[0];
201 nritems = btrfs_header_nritems(leaf);
202 slot = path->slots[0];
203 }
204 item = btrfs_item_nr(leaf, slot);
205 btrfs_item_key_to_cpu(leaf, &key, slot);
206 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
207 goto next;
208
209 if (key.objectid < objectid)
210 goto next;
211
212 if (key.objectid > objectid)
213 break;
214
215 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
216 if (btrfs_disk_root_refs(leaf, ri) != 0)
217 goto next;
218
219 memcpy(&found_key, &key, sizeof(key));
220 key.offset++;
221 btrfs_release_path(root, path);
222 dead_root =
223 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
224 &found_key);
225 if (IS_ERR(dead_root)) {
226 ret = PTR_ERR(dead_root);
227 goto err;
228 }
229
230 if (objectid == BTRFS_TREE_RELOC_OBJECTID)
231 ret = btrfs_add_dead_reloc_root(dead_root);
232 else
233 ret = btrfs_add_dead_root(dead_root, latest);
234 if (ret)
235 goto err;
236 goto again;
237next:
238 slot++;
239 path->slots[0]++;
240 }
241 ret = 0;
242err:
243 btrfs_free_path(path);
244 return ret;
245}
246
247/* drop the root item for 'key' from 'root' */
248int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
249 struct btrfs_key *key)
250{
251 struct btrfs_path *path;
252 int ret;
253 u32 refs;
254 struct btrfs_root_item *ri;
255 struct extent_buffer *leaf;
256
257 path = btrfs_alloc_path();
258 BUG_ON(!path);
259 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
260 if (ret < 0)
261 goto out;
262
263 BUG_ON(ret != 0);
264 leaf = path->nodes[0];
265 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
266
267 refs = btrfs_disk_root_refs(leaf, ri);
268 BUG_ON(refs != 0);
269 ret = btrfs_del_item(trans, root, path);
270out:
271 btrfs_release_path(root, path);
272 btrfs_free_path(path);
273 return ret;
274}
275
276#if 0 /* this will get used when snapshot deletion is implemented */
277int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
278 struct btrfs_root *tree_root,
279 u64 root_id, u8 type, u64 ref_id)
280{
281 struct btrfs_key key;
282 int ret;
283 struct btrfs_path *path;
284
285 path = btrfs_alloc_path();
286
287 key.objectid = root_id;
288 key.type = type;
289 key.offset = ref_id;
290
291 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
292 BUG_ON(ret);
293
294 ret = btrfs_del_item(trans, tree_root, path);
295 BUG_ON(ret);
296
297 btrfs_free_path(path);
298 return ret;
299}
300#endif
301
302int btrfs_find_root_ref(struct btrfs_root *tree_root,
303 struct btrfs_path *path,
304 u64 root_id, u64 ref_id)
305{
306 struct btrfs_key key;
307 int ret;
308
309 key.objectid = root_id;
310 key.type = BTRFS_ROOT_REF_KEY;
311 key.offset = ref_id;
312
313 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
314 return ret;
315}
316
317
318/*
319 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
320 * or BTRFS_ROOT_BACKREF_KEY.
321 *
322 * The dirid, sequence, name and name_len refer to the directory entry
323 * that is referencing the root.
324 *
325 * For a forward ref, the root_id is the id of the tree referencing
326 * the root and ref_id is the id of the subvol or snapshot.
327 *
328 * For a back ref the root_id is the id of the subvol or snapshot and
329 * ref_id is the id of the tree referencing it.
330 */
331int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
332 struct btrfs_root *tree_root,
333 u64 root_id, u8 type, u64 ref_id,
334 u64 dirid, u64 sequence,
335 const char *name, int name_len)
336{
337 struct btrfs_key key;
338 int ret;
339 struct btrfs_path *path;
340 struct btrfs_root_ref *ref;
341 struct extent_buffer *leaf;
342 unsigned long ptr;
343
344
345 path = btrfs_alloc_path();
346
347 key.objectid = root_id;
348 key.type = type;
349 key.offset = ref_id;
350
351 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
352 sizeof(*ref) + name_len);
353 BUG_ON(ret);
354
355 leaf = path->nodes[0];
356 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
357 btrfs_set_root_ref_dirid(leaf, ref, dirid);
358 btrfs_set_root_ref_sequence(leaf, ref, sequence);
359 btrfs_set_root_ref_name_len(leaf, ref, name_len);
360 ptr = (unsigned long)(ref + 1);
361 write_extent_buffer(leaf, name, ptr, name_len);
362 btrfs_mark_buffer_dirty(leaf);
363
364 btrfs_free_path(path);
365 return ret;
366}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..c0f7ecaf1e79
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,139 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/highmem.h>
20
21/* this is some deeply nasty code. ctree.h has a different
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
23 *
24 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions
26 *
27 * This file declares the macros and then #includes ctree.h, which results
28 * in cpp creating the function here based on the template below.
29 *
30 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent
32 * buffers. Every pointer to metadata items in btrfs is really just
33 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking.
35 *
36 * The extent buffer api is used to do all the kmapping and page
37 * spanning work required to get extent buffers in highmem and have
38 * a metadata blocksize different from the page size.
39 *
40 * The macro starts with a simple function prototype declaration so that
41 * sparse won't complain about it being static.
42 */
43
#define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
u##bits btrfs_##name(struct extent_buffer *eb, type *s);		\
void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);	\
/* getter: return 'member' of the item at offset 's', in cpu order */	\
u##bits btrfs_##name(struct extent_buffer *eb, type *s)		\
{									\
	unsigned long item_off = (unsigned long)s;			\
	unsigned long field_off = item_off + offsetof(type, member);	\
	/* sampled before any mapping is attempted below */		\
	int had_no_mapping = (eb->map_token == NULL);			\
	unsigned long tmp_start;					\
	unsigned long tmp_len;						\
	char *tmp_token;						\
	char *tmp_kaddr;						\
	__le##bits raw;							\
	u##bits result;							\
	type *p;							\
									\
	/* fast path: the current mapping already covers the field */	\
	if (eb->map_token && field_off >= eb->map_start &&		\
	    field_off + sizeof(((type *)0)->member) <=			\
	    eb->map_start + eb->map_len) {				\
		p = (type *)(eb->kaddr + item_off - eb->map_start);	\
		return le##bits##_to_cpu(p->member);			\
	}								\
									\
	/* could not map the field: copy it out of the buffer */	\
	if (map_extent_buffer(eb, field_off,				\
			      sizeof(((type *)0)->member),		\
			      &tmp_token, &tmp_kaddr,			\
			      &tmp_start, &tmp_len, KM_USER1)) {	\
		read_eb_member(eb, s, type, member, &raw);		\
		return le##bits##_to_cpu(raw);				\
	}								\
									\
	p = (type *)(tmp_kaddr + item_off - tmp_start);			\
	result = le##bits##_to_cpu(p->member);				\
	if (had_no_mapping)						\
		unmap_extent_buffer(eb, tmp_token, KM_USER1);		\
	return result;							\
}									\
/* setter: store 'val' into 'member' of the item, little endian */	\
void btrfs_set_##name(struct extent_buffer *eb,			\
		      type *s, u##bits val)				\
{									\
	unsigned long item_off = (unsigned long)s;			\
	unsigned long field_off = item_off + offsetof(type, member);	\
	/* sampled before any mapping is attempted below */		\
	int had_no_mapping = (eb->map_token == NULL);			\
	unsigned long tmp_start;					\
	unsigned long tmp_len;						\
	char *tmp_token;						\
	char *tmp_kaddr;						\
	__le##bits raw;							\
	type *p;							\
									\
	/* fast path: the current mapping already covers the field */	\
	if (eb->map_token && field_off >= eb->map_start &&		\
	    field_off + sizeof(((type *)0)->member) <=			\
	    eb->map_start + eb->map_len) {				\
		p = (type *)(eb->kaddr + item_off - eb->map_start);	\
		p->member = cpu_to_le##bits(val);			\
		return;							\
	}								\
									\
	/* could not map the field: copy the value into the buffer */	\
	if (map_extent_buffer(eb, field_off,				\
			      sizeof(((type *)0)->member),		\
			      &tmp_token, &tmp_kaddr,			\
			      &tmp_start, &tmp_len, KM_USER1)) {	\
		raw = cpu_to_le##bits(val);				\
		write_eb_member(eb, s, type, member, &raw);		\
		return;							\
	}								\
									\
	p = (type *)(tmp_kaddr + item_off - tmp_start);			\
	p->member = cpu_to_le##bits(val);				\
	if (had_no_mapping)						\
		unmap_extent_buffer(eb, tmp_token, KM_USER1);		\
}
121
122#include "ctree.h"
123
124void btrfs_node_key(struct extent_buffer *eb,
125 struct btrfs_disk_key *disk_key, int nr)
126{
127 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
128 if (eb->map_token && ptr >= eb->map_start &&
129 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
130 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
131 sizeof(*disk_key));
132 return;
133 } else if (eb->map_token) {
134 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
135 eb->map_token = NULL;
136 }
137 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
138 struct btrfs_key_ptr, key, disk_key);
139}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..db9fb3bc1e33
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,723 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include <linux/magic.h>
42#include "compat.h"
43#include "ctree.h"
44#include "disk-io.h"
45#include "transaction.h"
46#include "btrfs_inode.h"
47#include "ioctl.h"
48#include "print-tree.h"
49#include "xattr.h"
50#include "volumes.h"
51#include "version.h"
52#include "export.h"
53#include "compression.h"
54
55
56static struct super_operations btrfs_super_ops;
57
58static void btrfs_put_super(struct super_block *sb)
59{
60 struct btrfs_root *root = btrfs_sb(sb);
61 int ret;
62
63 ret = close_ctree(root);
64 sb->s_fs_info = NULL;
65}
66
67enum {
68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
71};
72
73static match_table_t tokens = {
74 {Opt_degraded, "degraded"},
75 {Opt_subvol, "subvol=%s"},
76 {Opt_device, "device=%s"},
77 {Opt_nodatasum, "nodatasum"},
78 {Opt_nodatacow, "nodatacow"},
79 {Opt_nobarrier, "nobarrier"},
80 {Opt_max_extent, "max_extent=%s"},
81 {Opt_max_inline, "max_inline=%s"},
82 {Opt_alloc_start, "alloc_start=%s"},
83 {Opt_thread_pool, "thread_pool=%d"},
84 {Opt_compress, "compress"},
85 {Opt_ssd, "ssd"},
86 {Opt_noacl, "noacl"},
87 {Opt_err, NULL},
88};
89
90u64 btrfs_parse_size(char *str)
91{
92 u64 res;
93 int mult = 1;
94 char *end;
95 char last;
96
97 res = simple_strtoul(str, &end, 10);
98
99 last = end[0];
100 if (isalpha(last)) {
101 last = tolower(last);
102 switch (last) {
103 case 'g':
104 mult *= 1024;
105 case 'm':
106 mult *= 1024;
107 case 'k':
108 mult *= 1024;
109 }
110 res = res * mult;
111 }
112 return res;
113}
114
115/*
116 * Regular mount options parser. Everything that is needed only when
117 * reading in a new superblock is parsed here.
118 */
119int btrfs_parse_options(struct btrfs_root *root, char *options)
120{
121 struct btrfs_fs_info *info = root->fs_info;
122 substring_t args[MAX_OPT_ARGS];
123 char *p, *num;
124 int intarg;
125
126 if (!options)
127 return 0;
128
129 /*
130 * strsep changes the string, duplicate it because parse_options
131 * gets called twice
132 */
133 options = kstrdup(options, GFP_NOFS);
134 if (!options)
135 return -ENOMEM;
136
137
138 while ((p = strsep(&options, ",")) != NULL) {
139 int token;
140 if (!*p)
141 continue;
142
143 token = match_token(p, tokens, args);
144 switch (token) {
145 case Opt_degraded:
146 printk(KERN_INFO "btrfs: allowing degraded mounts\n");
147 btrfs_set_opt(info->mount_opt, DEGRADED);
148 break;
149 case Opt_subvol:
150 case Opt_device:
151 /*
152 * These are parsed by btrfs_parse_early_options
153 * and can be happily ignored here.
154 */
155 break;
156 case Opt_nodatasum:
157 printk(KERN_INFO "btrfs: setting nodatacsum\n");
158 btrfs_set_opt(info->mount_opt, NODATASUM);
159 break;
160 case Opt_nodatacow:
161 printk(KERN_INFO "btrfs: setting nodatacow\n");
162 btrfs_set_opt(info->mount_opt, NODATACOW);
163 btrfs_set_opt(info->mount_opt, NODATASUM);
164 break;
165 case Opt_compress:
166 printk(KERN_INFO "btrfs: use compression\n");
167 btrfs_set_opt(info->mount_opt, COMPRESS);
168 break;
169 case Opt_ssd:
170 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
171 btrfs_set_opt(info->mount_opt, SSD);
172 break;
173 case Opt_nobarrier:
174 printk(KERN_INFO "btrfs: turning off barriers\n");
175 btrfs_set_opt(info->mount_opt, NOBARRIER);
176 break;
177 case Opt_thread_pool:
178 intarg = 0;
179 match_int(&args[0], &intarg);
180 if (intarg) {
181 info->thread_pool_size = intarg;
182 printk(KERN_INFO "btrfs: thread pool %d\n",
183 info->thread_pool_size);
184 }
185 break;
186 case Opt_max_extent:
187 num = match_strdup(&args[0]);
188 if (num) {
189 info->max_extent = btrfs_parse_size(num);
190 kfree(num);
191
192 info->max_extent = max_t(u64,
193 info->max_extent, root->sectorsize);
194 printk(KERN_INFO "btrfs: max_extent at %llu\n",
195 info->max_extent);
196 }
197 break;
198 case Opt_max_inline:
199 num = match_strdup(&args[0]);
200 if (num) {
201 info->max_inline = btrfs_parse_size(num);
202 kfree(num);
203
204 if (info->max_inline) {
205 info->max_inline = max_t(u64,
206 info->max_inline,
207 root->sectorsize);
208 }
209 printk(KERN_INFO "btrfs: max_inline at %llu\n",
210 info->max_inline);
211 }
212 break;
213 case Opt_alloc_start:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->alloc_start = btrfs_parse_size(num);
217 kfree(num);
218 printk(KERN_INFO
219 "btrfs: allocations start at %llu\n",
220 info->alloc_start);
221 }
222 break;
223 case Opt_noacl:
224 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
225 break;
226 default:
227 break;
228 }
229 }
230 kfree(options);
231 return 0;
232}
233
234/*
235 * Parse mount options that are required early in the mount process.
236 *
237 * All other options are parsed much later in the mount process and
238 * only when we need to allocate a new super block.
239 */
240static int btrfs_parse_early_options(const char *options, fmode_t flags,
241 void *holder, char **subvol_name,
242 struct btrfs_fs_devices **fs_devices)
243{
244 substring_t args[MAX_OPT_ARGS];
245 char *opts, *p;
246 int error = 0;
247
248 if (!options)
249 goto out;
250
251 /*
252 * strsep changes the string, duplicate it because parse_options
253 * gets called twice
254 */
255 opts = kstrdup(options, GFP_KERNEL);
256 if (!opts)
257 return -ENOMEM;
258
259 while ((p = strsep(&opts, ",")) != NULL) {
260 int token;
261 if (!*p)
262 continue;
263
264 token = match_token(p, tokens, args);
265 switch (token) {
266 case Opt_subvol:
267 *subvol_name = match_strdup(&args[0]);
268 break;
269 case Opt_device:
270 error = btrfs_scan_one_device(match_strdup(&args[0]),
271 flags, holder, fs_devices);
272 if (error)
273 goto out_free_opts;
274 break;
275 default:
276 break;
277 }
278 }
279
280 out_free_opts:
281 kfree(opts);
282 out:
283 /*
284 * If no subvolume name is specified we use the default one. Allocate
285 * a copy of the string "." here so that code later in the
286 * mount path doesn't care if it's the default volume or another one.
287 */
288 if (!*subvol_name) {
289 *subvol_name = kstrdup(".", GFP_KERNEL);
290 if (!*subvol_name)
291 return -ENOMEM;
292 }
293 return error;
294}
295
296static int btrfs_fill_super(struct super_block *sb,
297 struct btrfs_fs_devices *fs_devices,
298 void *data, int silent)
299{
300 struct inode *inode;
301 struct dentry *root_dentry;
302 struct btrfs_super_block *disk_super;
303 struct btrfs_root *tree_root;
304 struct btrfs_inode *bi;
305 int err;
306
307 sb->s_maxbytes = MAX_LFS_FILESIZE;
308 sb->s_magic = BTRFS_SUPER_MAGIC;
309 sb->s_op = &btrfs_super_ops;
310 sb->s_export_op = &btrfs_export_ops;
311 sb->s_xattr = btrfs_xattr_handlers;
312 sb->s_time_gran = 1;
313 sb->s_flags |= MS_POSIXACL;
314
315 tree_root = open_ctree(sb, fs_devices, (char *)data);
316
317 if (IS_ERR(tree_root)) {
318 printk("btrfs: open_ctree failed\n");
319 return PTR_ERR(tree_root);
320 }
321 sb->s_fs_info = tree_root;
322 disk_super = &tree_root->fs_info->super_copy;
323 inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
324 tree_root->fs_info->fs_root);
325 bi = BTRFS_I(inode);
326 bi->location.objectid = inode->i_ino;
327 bi->location.offset = 0;
328 bi->root = tree_root->fs_info->fs_root;
329
330 btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
331
332 if (!inode) {
333 err = -ENOMEM;
334 goto fail_close;
335 }
336 if (inode->i_state & I_NEW) {
337 btrfs_read_locked_inode(inode);
338 unlock_new_inode(inode);
339 }
340
341 root_dentry = d_alloc_root(inode);
342 if (!root_dentry) {
343 iput(inode);
344 err = -ENOMEM;
345 goto fail_close;
346 }
347#if 0
348 /* this does the super kobj at the same time */
349 err = btrfs_sysfs_add_super(tree_root->fs_info);
350 if (err)
351 goto fail_close;
352#endif
353
354 sb->s_root = root_dentry;
355
356 save_mount_options(sb, data);
357 return 0;
358
359fail_close:
360 close_ctree(tree_root);
361 return err;
362}
363
364int btrfs_sync_fs(struct super_block *sb, int wait)
365{
366 struct btrfs_trans_handle *trans;
367 struct btrfs_root *root;
368 int ret;
369 root = btrfs_sb(sb);
370
371 if (sb->s_flags & MS_RDONLY)
372 return 0;
373
374 sb->s_dirt = 0;
375 if (!wait) {
376 filemap_flush(root->fs_info->btree_inode->i_mapping);
377 return 0;
378 }
379
380 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0);
382
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0;
387 return ret;
388}
389
390static void btrfs_write_super(struct super_block *sb)
391{
392 sb->s_dirt = 0;
393}
394
395static int btrfs_test_super(struct super_block *s, void *data)
396{
397 struct btrfs_fs_devices *test_fs_devices = data;
398 struct btrfs_root *root = btrfs_sb(s);
399
400 return root->fs_info->fs_devices == test_fs_devices;
401}
402
403/*
404 * Find a superblock for the given device / mount point.
405 *
406 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
407 * for multiple device setup. Make sure to keep it in sync.
408 */
409static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
410 const char *dev_name, void *data, struct vfsmount *mnt)
411{
412 char *subvol_name = NULL;
413 struct block_device *bdev = NULL;
414 struct super_block *s;
415 struct dentry *root;
416 struct btrfs_fs_devices *fs_devices = NULL;
417 fmode_t mode = FMODE_READ;
418 int error = 0;
419
420 if (!(flags & MS_RDONLY))
421 mode |= FMODE_WRITE;
422
423 error = btrfs_parse_early_options(data, mode, fs_type,
424 &subvol_name, &fs_devices);
425 if (error)
426 return error;
427
428 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
429 if (error)
430 goto error_free_subvol_name;
431
432 error = btrfs_open_devices(fs_devices, mode, fs_type);
433 if (error)
434 goto error_free_subvol_name;
435
436 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
437 error = -EACCES;
438 goto error_close_devices;
439 }
440
441 bdev = fs_devices->latest_bdev;
442 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
443 if (IS_ERR(s))
444 goto error_s;
445
446 if (s->s_root) {
447 if ((flags ^ s->s_flags) & MS_RDONLY) {
448 up_write(&s->s_umount);
449 deactivate_super(s);
450 error = -EBUSY;
451 goto error_close_devices;
452 }
453
454 btrfs_close_devices(fs_devices);
455 } else {
456 char b[BDEVNAME_SIZE];
457
458 s->s_flags = flags;
459 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
460 error = btrfs_fill_super(s, fs_devices, data,
461 flags & MS_SILENT ? 1 : 0);
462 if (error) {
463 up_write(&s->s_umount);
464 deactivate_super(s);
465 goto error_free_subvol_name;
466 }
467
468 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
469 s->s_flags |= MS_ACTIVE;
470 }
471
472 if (!strcmp(subvol_name, "."))
473 root = dget(s->s_root);
474 else {
475 mutex_lock(&s->s_root->d_inode->i_mutex);
476 root = lookup_one_len(subvol_name, s->s_root,
477 strlen(subvol_name));
478 mutex_unlock(&s->s_root->d_inode->i_mutex);
479
480 if (IS_ERR(root)) {
481 up_write(&s->s_umount);
482 deactivate_super(s);
483 error = PTR_ERR(root);
484 goto error_free_subvol_name;
485 }
486 if (!root->d_inode) {
487 dput(root);
488 up_write(&s->s_umount);
489 deactivate_super(s);
490 error = -ENXIO;
491 goto error_free_subvol_name;
492 }
493 }
494
495 mnt->mnt_sb = s;
496 mnt->mnt_root = root;
497
498 kfree(subvol_name);
499 return 0;
500
501error_s:
502 error = PTR_ERR(s);
503error_close_devices:
504 btrfs_close_devices(fs_devices);
505error_free_subvol_name:
506 kfree(subvol_name);
507 return error;
508}
509
510static int btrfs_remount(struct super_block *sb, int *flags, char *data)
511{
512 struct btrfs_root *root = btrfs_sb(sb);
513 int ret;
514
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0;
517
518 if (*flags & MS_RDONLY) {
519 sb->s_flags |= MS_RDONLY;
520
521 ret = btrfs_commit_super(root);
522 WARN_ON(ret);
523 } else {
524 if (root->fs_info->fs_devices->rw_devices == 0)
525 return -EACCES;
526
527 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
528 return -EINVAL;
529
530 ret = btrfs_cleanup_reloc_trees(root);
531 WARN_ON(ret);
532
533 ret = btrfs_cleanup_fs_roots(root->fs_info);
534 WARN_ON(ret);
535
536 sb->s_flags &= ~MS_RDONLY;
537 }
538
539 return 0;
540}
541
542static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
543{
544 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
545 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
546 int bits = dentry->d_sb->s_blocksize_bits;
547 __be32 *fsid = (__be32 *)root->fs_info->fsid;
548
549 buf->f_namelen = BTRFS_NAME_LEN;
550 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
551 buf->f_bfree = buf->f_blocks -
552 (btrfs_super_bytes_used(disk_super) >> bits);
553 buf->f_bavail = buf->f_bfree;
554 buf->f_bsize = dentry->d_sb->s_blocksize;
555 buf->f_type = BTRFS_SUPER_MAGIC;
556
557 /* We treat it as constant endianness (it doesn't matter _which_)
558 because we want the fsid to come out the same whether mounted
559 on a big-endian or little-endian host */
560 buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
561 buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
562 /* Mask in the root object ID too, to disambiguate subvols */
563 buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
564 buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
565
566 return 0;
567}
568
569static struct file_system_type btrfs_fs_type = {
570 .owner = THIS_MODULE,
571 .name = "btrfs",
572 .get_sb = btrfs_get_sb,
573 .kill_sb = kill_anon_super,
574 .fs_flags = FS_REQUIRES_DEV,
575};
576
577/*
578 * used by btrfsctl to scan devices when no FS is mounted
579 */
580static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
581 unsigned long arg)
582{
583 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices;
585 int ret = -ENOTTY;
586 int len;
587
588 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM;
590
591 vol = kmalloc(sizeof(*vol), GFP_KERNEL);
592 if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT;
594 goto out;
595 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597
598 switch (cmd) {
599 case BTRFS_IOC_SCAN_DEV:
600 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
601 &btrfs_fs_type, &fs_devices);
602 break;
603 }
604out:
605 kfree(vol);
606 return ret;
607}
608
609static int btrfs_freeze(struct super_block *sb)
610{
611 struct btrfs_root *root = btrfs_sb(sb);
612 mutex_lock(&root->fs_info->transaction_kthread_mutex);
613 mutex_lock(&root->fs_info->cleaner_mutex);
614 return 0;
615}
616
617static int btrfs_unfreeze(struct super_block *sb)
618{
619 struct btrfs_root *root = btrfs_sb(sb);
620 mutex_unlock(&root->fs_info->cleaner_mutex);
621 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
622 return 0;
623}
624
625static struct super_operations btrfs_super_ops = {
626 .delete_inode = btrfs_delete_inode,
627 .put_super = btrfs_put_super,
628 .write_super = btrfs_write_super,
629 .sync_fs = btrfs_sync_fs,
630 .show_options = generic_show_options,
631 .write_inode = btrfs_write_inode,
632 .dirty_inode = btrfs_dirty_inode,
633 .alloc_inode = btrfs_alloc_inode,
634 .destroy_inode = btrfs_destroy_inode,
635 .statfs = btrfs_statfs,
636 .remount_fs = btrfs_remount,
637 .freeze_fs = btrfs_freeze,
638 .unfreeze_fs = btrfs_unfreeze,
639};
640
641static const struct file_operations btrfs_ctl_fops = {
642 .unlocked_ioctl = btrfs_control_ioctl,
643 .compat_ioctl = btrfs_control_ioctl,
644 .owner = THIS_MODULE,
645};
646
647static struct miscdevice btrfs_misc = {
648 .minor = MISC_DYNAMIC_MINOR,
649 .name = "btrfs-control",
650 .fops = &btrfs_ctl_fops
651};
652
653static int btrfs_interface_init(void)
654{
655 return misc_register(&btrfs_misc);
656}
657
658static void btrfs_interface_exit(void)
659{
660 if (misc_deregister(&btrfs_misc) < 0)
661 printk(KERN_INFO "misc_deregister failed for control device");
662}
663
664static int __init init_btrfs_fs(void)
665{
666 int err;
667
668 err = btrfs_init_sysfs();
669 if (err)
670 return err;
671
672 err = btrfs_init_cachep();
673 if (err)
674 goto free_sysfs;
675
676 err = extent_io_init();
677 if (err)
678 goto free_cachep;
679
680 err = extent_map_init();
681 if (err)
682 goto free_extent_io;
683
684 err = btrfs_interface_init();
685 if (err)
686 goto free_extent_map;
687
688 err = register_filesystem(&btrfs_fs_type);
689 if (err)
690 goto unregister_ioctl;
691
692 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
693 return 0;
694
695unregister_ioctl:
696 btrfs_interface_exit();
697free_extent_map:
698 extent_map_exit();
699free_extent_io:
700 extent_io_exit();
701free_cachep:
702 btrfs_destroy_cachep();
703free_sysfs:
704 btrfs_exit_sysfs();
705 return err;
706}
707
708static void __exit exit_btrfs_fs(void)
709{
710 btrfs_destroy_cachep();
711 extent_map_exit();
712 extent_io_exit();
713 btrfs_interface_exit();
714 unregister_filesystem(&btrfs_fs_type);
715 btrfs_exit_sysfs();
716 btrfs_cleanup_fs_uuids();
717 btrfs_zlib_exit();
718}
719
720module_init(init_btrfs_fs)
721module_exit(exit_btrfs_fs)
722
723MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..a240b6fa81df
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,269 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h>
26
27#include "ctree.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
71 show, store)
72
73ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
74ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
75
76static struct attribute *btrfs_root_attrs[] = {
77 &btrfs_root_attr_blocks_used.attr,
78 &btrfs_root_attr_block_limit.attr,
79 NULL,
80};
81
82/* this is for super attrs (actual full fs) */
83struct btrfs_super_attr {
84 struct attribute attr;
85 ssize_t (*show)(struct btrfs_fs_info *, char *);
86 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
87};
88
89#define SUPER_ATTR(name, mode, show, store) \
90static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
91 show, store)
92
93SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
94SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
95SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
96
97static struct attribute *btrfs_super_attrs[] = {
98 &btrfs_super_attr_blocks_used.attr,
99 &btrfs_super_attr_total_blocks.attr,
100 &btrfs_super_attr_blocksize.attr,
101 NULL,
102};
103
104static ssize_t btrfs_super_attr_show(struct kobject *kobj,
105 struct attribute *attr, char *buf)
106{
107 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
108 super_kobj);
109 struct btrfs_super_attr *a = container_of(attr,
110 struct btrfs_super_attr,
111 attr);
112
113 return a->show ? a->show(fs, buf) : 0;
114}
115
116static ssize_t btrfs_super_attr_store(struct kobject *kobj,
117 struct attribute *attr,
118 const char *buf, size_t len)
119{
120 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
121 super_kobj);
122 struct btrfs_super_attr *a = container_of(attr,
123 struct btrfs_super_attr,
124 attr);
125
126 return a->store ? a->store(fs, buf, len) : 0;
127}
128
129static ssize_t btrfs_root_attr_show(struct kobject *kobj,
130 struct attribute *attr, char *buf)
131{
132 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
133 root_kobj);
134 struct btrfs_root_attr *a = container_of(attr,
135 struct btrfs_root_attr,
136 attr);
137
138 return a->show ? a->show(root, buf) : 0;
139}
140
141static ssize_t btrfs_root_attr_store(struct kobject *kobj,
142 struct attribute *attr,
143 const char *buf, size_t len)
144{
145 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
146 root_kobj);
147 struct btrfs_root_attr *a = container_of(attr,
148 struct btrfs_root_attr,
149 attr);
150 return a->store ? a->store(root, buf, len) : 0;
151}
152
153static void btrfs_super_release(struct kobject *kobj)
154{
155 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
156 super_kobj);
157 complete(&fs->kobj_unregister);
158}
159
160static void btrfs_root_release(struct kobject *kobj)
161{
162 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
163 root_kobj);
164 complete(&root->kobj_unregister);
165}
166
167static struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store,
170};
171
172static struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store,
175};
176
177static struct kobj_type btrfs_root_ktype = {
178 .default_attrs = btrfs_root_attrs,
179 .sysfs_ops = &btrfs_root_attr_ops,
180 .release = btrfs_root_release,
181};
182
183static struct kobj_type btrfs_super_ktype = {
184 .default_attrs = btrfs_super_attrs,
185 .sysfs_ops = &btrfs_super_attr_ops,
186 .release = btrfs_super_release,
187};
188
189/* /sys/fs/btrfs/ entry */
190static struct kset *btrfs_kset;
191
192int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
193{
194 int error;
195 char *name;
196 char c;
197 int len = strlen(fs->sb->s_id) + 1;
198 int i;
199
200 name = kmalloc(len, GFP_NOFS);
201 if (!name) {
202 error = -ENOMEM;
203 goto fail;
204 }
205
206 for (i = 0; i < len; i++) {
207 c = fs->sb->s_id[i];
208 if (c == '/' || c == '\\')
209 c = '!';
210 name[i] = c;
211 }
212 name[len] = '\0';
213
214 fs->super_kobj.kset = btrfs_kset;
215 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
216 NULL, "%s", name);
217 kfree(name);
218 if (error)
219 goto fail;
220
221 return 0;
222
223fail:
224 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
225 return error;
226}
227
228int btrfs_sysfs_add_root(struct btrfs_root *root)
229{
230 int error;
231
232 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
233 &root->fs_info->super_kobj,
234 "%s", root->name);
235 if (error)
236 goto fail;
237
238 return 0;
239
240fail:
241 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
242 return error;
243}
244
245void btrfs_sysfs_del_root(struct btrfs_root *root)
246{
247 kobject_put(&root->root_kobj);
248 wait_for_completion(&root->kobj_unregister);
249}
250
251void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
252{
253 kobject_put(&fs->super_kobj);
254 wait_for_completion(&fs->kobj_unregister);
255}
256
257int btrfs_init_sysfs(void)
258{
259 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
260 if (!btrfs_kset)
261 return -ENOMEM;
262 return 0;
263}
264
265void btrfs_exit_sysfs(void)
266{
267 kset_unregister(btrfs_kset);
268}
269
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..8a08f9443340
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1097 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/writeback.h>
22#include <linux/pagemap.h>
23#include <linux/blkdev.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h"
30
31#define BTRFS_ROOT_TRANS_TAG 0
32
33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{
35 WARN_ON(transaction->use_count == 0);
36 transaction->use_count--;
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 }
42}
43
44/*
45 * either allocate a new transaction or hop into the existing one
46 */
47static noinline int join_transaction(struct btrfs_root *root)
48{
49 struct btrfs_transaction *cur_trans;
50 cur_trans = root->fs_info->running_transaction;
51 if (!cur_trans) {
52 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
53 GFP_NOFS);
54 BUG_ON(!cur_trans);
55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation;
61 init_waitqueue_head(&cur_trans->writer_wait);
62 init_waitqueue_head(&cur_trans->commit_wait);
63 cur_trans->in_commit = 0;
64 cur_trans->blocked = 0;
65 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds();
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages,
71 root->fs_info->btree_inode->i_mapping,
72 GFP_NOFS);
73 spin_lock(&root->fs_info->new_trans_lock);
74 root->fs_info->running_transaction = cur_trans;
75 spin_unlock(&root->fs_info->new_trans_lock);
76 } else {
77 cur_trans->num_writers++;
78 cur_trans->num_joined++;
79 }
80
81 return 0;
82}
83
84/*
85 * this does all the record keeping required to make sure that a reference
86 * counted root is properly recorded in a given transaction. This is required
87 * to make sure the old root from before we joined the transaction is deleted
88 * when the transaction commits
89 */
90noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
91{
92 struct btrfs_dirty_root *dirty;
93 u64 running_trans_id = root->fs_info->running_transaction->transid;
94 if (root->ref_cows && root->last_trans < running_trans_id) {
95 WARN_ON(root == root->fs_info->extent_root);
96 if (root->root_item.refs != 0) {
97 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
98 (unsigned long)root->root_key.objectid,
99 BTRFS_ROOT_TRANS_TAG);
100
101 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
102 BUG_ON(!dirty);
103 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
104 BUG_ON(!dirty->root);
105 dirty->latest_root = root;
106 INIT_LIST_HEAD(&dirty->list);
107
108 root->commit_root = btrfs_root_node(root);
109
110 memcpy(dirty->root, root, sizeof(*root));
111 spin_lock_init(&dirty->root->node_lock);
112 spin_lock_init(&dirty->root->list_lock);
113 mutex_init(&dirty->root->objectid_mutex);
114 mutex_init(&dirty->root->log_mutex);
115 INIT_LIST_HEAD(&dirty->root->dead_list);
116 dirty->root->node = root->commit_root;
117 dirty->root->commit_root = NULL;
118
119 spin_lock(&root->list_lock);
120 list_add(&dirty->root->dead_list, &root->dead_list);
121 spin_unlock(&root->list_lock);
122
123 root->dirty_root = dirty;
124 } else {
125 WARN_ON(1);
126 }
127 root->last_trans = running_trans_id;
128 }
129 return 0;
130}
131
132/* wait for commit against the current transaction to become unblocked
133 * when this is done, it is safe to start a new transaction, but the current
134 * transaction might not be fully on disk.
135 */
136static void wait_current_trans(struct btrfs_root *root)
137{
138 struct btrfs_transaction *cur_trans;
139
140 cur_trans = root->fs_info->running_transaction;
141 if (cur_trans && cur_trans->blocked) {
142 DEFINE_WAIT(wait);
143 cur_trans->use_count++;
144 while (1) {
145 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
146 TASK_UNINTERRUPTIBLE);
147 if (cur_trans->blocked) {
148 mutex_unlock(&root->fs_info->trans_mutex);
149 schedule();
150 mutex_lock(&root->fs_info->trans_mutex);
151 finish_wait(&root->fs_info->transaction_wait,
152 &wait);
153 } else {
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 break;
157 }
158 }
159 put_transaction(cur_trans);
160 }
161}
162
163static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
164 int num_blocks, int wait)
165{
166 struct btrfs_trans_handle *h =
167 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
168 int ret;
169
170 mutex_lock(&root->fs_info->trans_mutex);
171 if (!root->fs_info->log_root_recovering &&
172 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
173 wait_current_trans(root);
174 ret = join_transaction(root);
175 BUG_ON(ret);
176
177 btrfs_record_root_in_trans(root);
178 h->transid = root->fs_info->running_transaction->transid;
179 h->transaction = root->fs_info->running_transaction;
180 h->blocks_reserved = num_blocks;
181 h->blocks_used = 0;
182 h->block_group = 0;
183 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0;
185 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex);
187 return h;
188}
189
/*
 * Start (or join) a transaction using wait mode 1 of start_transaction():
 * wait for a blocked transaction unless an ioctl transaction is open.
 */
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 1);
}
/*
 * Join the running transaction using wait mode 0 of start_transaction():
 * never wait for a blocked transaction first.
 */
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						  int num_blocks)
{
	return start_transaction(root, num_blocks, 0);
}
200
/*
 * Start a transaction using wait mode 2 of start_transaction(): wait for a
 * blocked transaction even when an ioctl transaction is already open.
 */
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							 int num_blocks)
{
	return start_transaction(r, num_blocks, 2);
}
206
207/* wait for a transaction commit to be fully complete */
208static noinline int wait_for_commit(struct btrfs_root *root,
209 struct btrfs_transaction *commit)
210{
211 DEFINE_WAIT(wait);
212 mutex_lock(&root->fs_info->trans_mutex);
213 while (!commit->commit_done) {
214 prepare_to_wait(&commit->commit_wait, &wait,
215 TASK_UNINTERRUPTIBLE);
216 if (commit->commit_done)
217 break;
218 mutex_unlock(&root->fs_info->trans_mutex);
219 schedule();
220 mutex_lock(&root->fs_info->trans_mutex);
221 }
222 mutex_unlock(&root->fs_info->trans_mutex);
223 finish_wait(&commit->commit_wait, &wait);
224 return 0;
225}
226
227/*
228 * rate limit against the drop_snapshot code. This helps to slow down new
229 * operations if the drop_snapshot code isn't able to keep up.
230 */
231static void throttle_on_drops(struct btrfs_root *root)
232{
233 struct btrfs_fs_info *info = root->fs_info;
234 int harder_count = 0;
235
236harder:
237 if (atomic_read(&info->throttles)) {
238 DEFINE_WAIT(wait);
239 int thr;
240 thr = atomic_read(&info->throttle_gen);
241
242 do {
243 prepare_to_wait(&info->transaction_throttle,
244 &wait, TASK_UNINTERRUPTIBLE);
245 if (!atomic_read(&info->throttles)) {
246 finish_wait(&info->transaction_throttle, &wait);
247 break;
248 }
249 schedule();
250 finish_wait(&info->transaction_throttle, &wait);
251 } while (thr == atomic_read(&info->throttle_gen));
252 harder_count++;
253
254 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
255 harder_count < 2)
256 goto harder;
257
258 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
259 harder_count < 10)
260 goto harder;
261
262 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
263 harder_count < 20)
264 goto harder;
265 }
266}
267
268void btrfs_throttle(struct btrfs_root *root)
269{
270 mutex_lock(&root->fs_info->trans_mutex);
271 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root);
276}
277
278static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
279 struct btrfs_root *root, int throttle)
280{
281 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info;
283
284 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction;
286 WARN_ON(cur_trans != trans->transaction);
287 WARN_ON(cur_trans->num_writers < 1);
288 cur_trans->num_writers--;
289
290 if (waitqueue_active(&cur_trans->writer_wait))
291 wake_up(&cur_trans->writer_wait);
292 put_transaction(cur_trans);
293 mutex_unlock(&info->trans_mutex);
294 memset(trans, 0, sizeof(*trans));
295 kmem_cache_free(btrfs_trans_handle_cachep, trans);
296
297 if (throttle)
298 throttle_on_drops(root);
299
300 return 0;
301}
302
/* End a transaction handle without throttling the caller. */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}
308
/* End a transaction handle, then rate limit against snapshot drops. */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}
314
315/*
316 * when btree blocks are allocated, they have some corresponding bits set for
317 * them in one of two extent_io trees. This is used to make sure all of
318 * those extents are on disk for transaction or log commit
319 */
320int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
321 struct extent_io_tree *dirty_pages)
322{
323 int ret;
324 int err = 0;
325 int werr = 0;
326 struct page *page;
327 struct inode *btree_inode = root->fs_info->btree_inode;
328 u64 start = 0;
329 u64 end;
330 unsigned long index;
331
332 while (1) {
333 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
334 EXTENT_DIRTY);
335 if (ret)
336 break;
337 while (start <= end) {
338 cond_resched();
339
340 index = start >> PAGE_CACHE_SHIFT;
341 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
342 page = find_get_page(btree_inode->i_mapping, index);
343 if (!page)
344 continue;
345
346 btree_lock_page_hook(page);
347 if (!page->mapping) {
348 unlock_page(page);
349 page_cache_release(page);
350 continue;
351 }
352
353 if (PageWriteback(page)) {
354 if (PageDirty(page))
355 wait_on_page_writeback(page);
356 else {
357 unlock_page(page);
358 page_cache_release(page);
359 continue;
360 }
361 }
362 err = write_one_page(page, 0);
363 if (err)
364 werr = err;
365 page_cache_release(page);
366 }
367 }
368 while (1) {
369 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
370 EXTENT_DIRTY);
371 if (ret)
372 break;
373
374 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
375 while (start <= end) {
376 index = start >> PAGE_CACHE_SHIFT;
377 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
378 page = find_get_page(btree_inode->i_mapping, index);
379 if (!page)
380 continue;
381 if (PageDirty(page)) {
382 btree_lock_page_hook(page);
383 wait_on_page_writeback(page);
384 err = write_one_page(page, 0);
385 if (err)
386 werr = err;
387 }
388 wait_on_page_writeback(page);
389 page_cache_release(page);
390 cond_resched();
391 }
392 }
393 if (err)
394 werr = err;
395 return werr;
396}
397
398int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
399 struct btrfs_root *root)
400{
401 if (!trans || !trans->transaction) {
402 struct inode *btree_inode;
403 btree_inode = root->fs_info->btree_inode;
404 return filemap_write_and_wait(btree_inode->i_mapping);
405 }
406 return btrfs_write_and_wait_marked_extents(root,
407 &trans->transaction->dirty_pages);
408}
409
410/*
411 * this is used to update the root pointer in the tree of tree roots.
412 *
413 * But, in the case of the extent allocation tree, updating the root
414 * pointer may allocate blocks which may change the root of the extent
415 * allocation tree.
416 *
417 * So, this loops and repeats and makes sure the cowonly root didn't
418 * change while the root pointer was being updated in the metadata.
419 */
420static int update_cowonly_root(struct btrfs_trans_handle *trans,
421 struct btrfs_root *root)
422{
423 int ret;
424 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root;
426
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root);
430
431 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
433 if (old_root_bytenr == root->node->start)
434 break;
435 btrfs_set_root_bytenr(&root->root_item,
436 root->node->start);
437 btrfs_set_root_level(&root->root_item,
438 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid);
440
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key,
445 &root->root_item);
446 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root);
449 }
450 return 0;
451}
452
453/*
454 * update all the cowonly tree roots on disk
455 */
456int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
457 struct btrfs_root *root)
458{
459 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next;
461 struct extent_buffer *eb;
462
463 btrfs_extent_post_op(trans, fs_info->tree_root);
464
465 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
467 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb);
469
470 btrfs_extent_post_op(trans, fs_info->tree_root);
471
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next;
474 list_del_init(next);
475 root = list_entry(next, struct btrfs_root, dirty_list);
476
477 update_cowonly_root(trans, root);
478 }
479 return 0;
480}
481
482/*
483 * dead roots are old snapshots that need to be deleted. This allocates
484 * a dirty root struct and adds it into the list of dead roots that need to
485 * be deleted
486 */
487int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
488{
489 struct btrfs_dirty_root *dirty;
490
491 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
492 if (!dirty)
493 return -ENOMEM;
494 dirty->root = root;
495 dirty->latest_root = latest;
496
497 mutex_lock(&root->fs_info->trans_mutex);
498 list_add(&dirty->list, &latest->fs_info->dead_roots);
499 mutex_unlock(&root->fs_info->trans_mutex);
500 return 0;
501}
502
503/*
504 * at transaction commit time we need to schedule the old roots for
505 * deletion via btrfs_drop_snapshot. This runs through all the
506 * reference counted roots that were modified in the current
507 * transaction and puts them into the drop list
508 */
509static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
510 struct radix_tree_root *radix,
511 struct list_head *list)
512{
513 struct btrfs_dirty_root *dirty;
514 struct btrfs_root *gang[8];
515 struct btrfs_root *root;
516 int i;
517 int ret;
518 int err = 0;
519 u32 refs;
520
521 while (1) {
522 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
523 ARRAY_SIZE(gang),
524 BTRFS_ROOT_TRANS_TAG);
525 if (ret == 0)
526 break;
527 for (i = 0; i < ret; i++) {
528 root = gang[i];
529 radix_tree_tag_clear(radix,
530 (unsigned long)root->root_key.objectid,
531 BTRFS_ROOT_TRANS_TAG);
532
533 BUG_ON(!root->ref_tree);
534 dirty = root->dirty_root;
535
536 btrfs_free_log(trans, root);
537 btrfs_free_reloc_root(trans, root);
538
539 if (root->commit_root == root->node) {
540 WARN_ON(root->node->start !=
541 btrfs_root_bytenr(&root->root_item));
542
543 free_extent_buffer(root->commit_root);
544 root->commit_root = NULL;
545 root->dirty_root = NULL;
546
547 spin_lock(&root->list_lock);
548 list_del_init(&dirty->root->dead_list);
549 spin_unlock(&root->list_lock);
550
551 kfree(dirty->root);
552 kfree(dirty);
553
554 /* make sure to update the root on disk
555 * so we get any updates to the block used
556 * counts
557 */
558 err = btrfs_update_root(trans,
559 root->fs_info->tree_root,
560 &root->root_key,
561 &root->root_item);
562 continue;
563 }
564
565 memset(&root->root_item.drop_progress, 0,
566 sizeof(struct btrfs_disk_key));
567 root->root_item.drop_level = 0;
568 root->commit_root = NULL;
569 root->dirty_root = NULL;
570 root->root_key.offset = root->fs_info->generation;
571 btrfs_set_root_bytenr(&root->root_item,
572 root->node->start);
573 btrfs_set_root_level(&root->root_item,
574 btrfs_header_level(root->node));
575 btrfs_set_root_generation(&root->root_item,
576 root->root_key.offset);
577
578 err = btrfs_insert_root(trans, root->fs_info->tree_root,
579 &root->root_key,
580 &root->root_item);
581 if (err)
582 break;
583
584 refs = btrfs_root_refs(&dirty->root->root_item);
585 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
586 err = btrfs_update_root(trans, root->fs_info->tree_root,
587 &dirty->root->root_key,
588 &dirty->root->root_item);
589
590 BUG_ON(err);
591 if (refs == 1) {
592 list_add(&dirty->list, list);
593 } else {
594 WARN_ON(1);
595 free_extent_buffer(dirty->root->node);
596 kfree(dirty->root);
597 kfree(dirty);
598 }
599 }
600 }
601 return err;
602}
603
604/*
605 * defrag a given btree. If cacheonly == 1, this won't read from the disk,
606 * otherwise every leaf in the btree is read and defragged.
607 */
608int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
609{
610 struct btrfs_fs_info *info = root->fs_info;
611 int ret;
612 struct btrfs_trans_handle *trans;
613 unsigned long nr;
614
615 smp_mb();
616 if (root->defrag_running)
617 return 0;
618 trans = btrfs_start_transaction(root, 1);
619 while (1) {
620 root->defrag_running = 1;
621 ret = btrfs_defrag_leaves(trans, root, cacheonly);
622 nr = trans->blocks_used;
623 btrfs_end_transaction(trans, root);
624 btrfs_btree_balance_dirty(info->tree_root, nr);
625 cond_resched();
626
627 trans = btrfs_start_transaction(root, 1);
628 if (root->fs_info->closing || ret != -EAGAIN)
629 break;
630 }
631 root->defrag_running = 0;
632 smp_mb();
633 btrfs_end_transaction(trans, root);
634 return 0;
635}
636
637/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them
640 */
641static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
642 struct list_head *list)
643{
644 struct btrfs_dirty_root *dirty;
645 struct btrfs_trans_handle *trans;
646 unsigned long nr;
647 u64 num_bytes;
648 u64 bytes_used;
649 u64 max_useless;
650 int ret = 0;
651 int err;
652
653 while (!list_empty(list)) {
654 struct btrfs_root *root;
655
656 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
657 list_del_init(&dirty->list);
658
659 num_bytes = btrfs_root_used(&dirty->root->root_item);
660 root = dirty->latest_root;
661 atomic_inc(&root->fs_info->throttles);
662
663 while (1) {
664 trans = btrfs_start_transaction(tree_root, 1);
665 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN)
668 break;
669 mutex_unlock(&root->fs_info->drop_mutex);
670
671 err = btrfs_update_root(trans,
672 tree_root,
673 &dirty->root->root_key,
674 &dirty->root->root_item);
675 if (err)
676 ret = err;
677 nr = trans->blocks_used;
678 ret = btrfs_end_transaction(trans, tree_root);
679 BUG_ON(ret);
680
681 btrfs_btree_balance_dirty(tree_root, nr);
682 cond_resched();
683 }
684 BUG_ON(ret);
685 atomic_dec(&root->fs_info->throttles);
686 wake_up(&root->fs_info->transaction_throttle);
687
688 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) {
691 btrfs_record_root_in_trans(root);
692 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes);
694 }
695
696 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
697 if (ret) {
698 BUG();
699 break;
700 }
701 mutex_unlock(&root->fs_info->drop_mutex);
702
703 spin_lock(&root->list_lock);
704 list_del_init(&dirty->root->dead_list);
705 if (!list_empty(&root->dead_list)) {
706 struct btrfs_root *oldest;
707 oldest = list_entry(root->dead_list.prev,
708 struct btrfs_root, dead_list);
709 max_useless = oldest->root_key.offset - 1;
710 } else {
711 max_useless = root->root_key.offset - 1;
712 }
713 spin_unlock(&root->list_lock);
714
715 nr = trans->blocks_used;
716 ret = btrfs_end_transaction(trans, tree_root);
717 BUG_ON(ret);
718
719 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
720 BUG_ON(ret);
721
722 free_extent_buffer(dirty->root->node);
723 kfree(dirty->root);
724 kfree(dirty);
725
726 btrfs_btree_balance_dirty(tree_root, nr);
727 cond_resched();
728 }
729 return ret;
730}
731
732/*
733 * new snapshots need to be created at a very specific time in the
734 * transaction commit. This does the actual creation
735 */
736static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
737 struct btrfs_fs_info *fs_info,
738 struct btrfs_pending_snapshot *pending)
739{
740 struct btrfs_key key;
741 struct btrfs_root_item *new_root_item;
742 struct btrfs_root *tree_root = fs_info->tree_root;
743 struct btrfs_root *root = pending->root;
744 struct extent_buffer *tmp;
745 struct extent_buffer *old;
746 int ret;
747 u64 objectid;
748
749 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
750 if (!new_root_item) {
751 ret = -ENOMEM;
752 goto fail;
753 }
754 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
755 if (ret)
756 goto fail;
757
758 btrfs_record_root_in_trans(root);
759 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
760 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
761
762 key.objectid = objectid;
763 key.offset = trans->transid;
764 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
765
766 old = btrfs_lock_root_node(root);
767 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
768
769 btrfs_copy_root(trans, root, old, &tmp, objectid);
770 btrfs_tree_unlock(old);
771 free_extent_buffer(old);
772
773 btrfs_set_root_bytenr(new_root_item, tmp->start);
774 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
775 btrfs_set_root_generation(new_root_item, trans->transid);
776 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
777 new_root_item);
778 btrfs_tree_unlock(tmp);
779 free_extent_buffer(tmp);
780 if (ret)
781 goto fail;
782
783 key.offset = (u64)-1;
784 memcpy(&pending->root_key, &key, sizeof(key));
785fail:
786 kfree(new_root_item);
787 return ret;
788}
789
790static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
791 struct btrfs_pending_snapshot *pending)
792{
793 int ret;
794 int namelen;
795 u64 index = 0;
796 struct btrfs_trans_handle *trans;
797 struct inode *parent_inode;
798 struct inode *inode;
799 struct btrfs_root *parent_root;
800
801 parent_inode = pending->dentry->d_parent->d_inode;
802 parent_root = BTRFS_I(parent_inode)->root;
803 trans = btrfs_join_transaction(parent_root, 1);
804
805 /*
806 * insert the directory item
807 */
808 namelen = strlen(pending->name);
809 ret = btrfs_set_inode_index(parent_inode, &index);
810 ret = btrfs_insert_dir_item(trans, parent_root,
811 pending->name, namelen,
812 parent_inode->i_ino,
813 &pending->root_key, BTRFS_FT_DIR, index);
814
815 if (ret)
816 goto fail;
817
818 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
819 ret = btrfs_update_inode(trans, parent_root, parent_inode);
820 BUG_ON(ret);
821
822 /* add the backref first */
823 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
824 pending->root_key.objectid,
825 BTRFS_ROOT_BACKREF_KEY,
826 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name,
828 namelen);
829
830 BUG_ON(ret);
831
832 /* now add the forward ref */
833 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
834 parent_root->root_key.objectid,
835 BTRFS_ROOT_REF_KEY,
836 pending->root_key.objectid,
837 parent_inode->i_ino, index, pending->name,
838 namelen);
839
840 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
841 d_instantiate(pending->dentry, inode);
842fail:
843 btrfs_end_transaction(trans, fs_info->fs_root);
844 return ret;
845}
846
847/*
848 * create all the snapshots we've scheduled for creation
849 */
850static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
851 struct btrfs_fs_info *fs_info)
852{
853 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret;
857
858 list_for_each(cur, head) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret);
862 }
863 return 0;
864}
865
866static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
867 struct btrfs_fs_info *fs_info)
868{
869 struct btrfs_pending_snapshot *pending;
870 struct list_head *head = &trans->transaction->pending_snapshots;
871 int ret;
872
873 while (!list_empty(head)) {
874 pending = list_entry(head->next,
875 struct btrfs_pending_snapshot, list);
876 ret = finish_pending_snapshot(fs_info, pending);
877 BUG_ON(ret);
878 list_del(&pending->list);
879 kfree(pending->name);
880 kfree(pending);
881 }
882 return 0;
883}
884
885int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
886 struct btrfs_root *root)
887{
888 unsigned long joined = 0;
889 unsigned long timeout = 1;
890 struct btrfs_transaction *cur_trans;
891 struct btrfs_transaction *prev_trans = NULL;
892 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
893 struct list_head dirty_fs_roots;
894 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait);
896 int ret;
897
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) {
901 cur_trans = trans->transaction;
902 trans->transaction->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root);
905
906 ret = wait_for_commit(root, cur_trans);
907 BUG_ON(ret);
908
909 mutex_lock(&root->fs_info->trans_mutex);
910 put_transaction(cur_trans);
911 mutex_unlock(&root->fs_info->trans_mutex);
912
913 return 0;
914 }
915
916 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
917 if (!pinned_copy)
918 return -ENOMEM;
919
920 extent_io_tree_init(pinned_copy,
921 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
922
923 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list);
929 if (!prev_trans->commit_done) {
930 prev_trans->use_count++;
931 mutex_unlock(&root->fs_info->trans_mutex);
932
933 wait_for_commit(root, prev_trans);
934
935 mutex_lock(&root->fs_info->trans_mutex);
936 put_transaction(prev_trans);
937 }
938 }
939
940 do {
941 int snap_pending = 0;
942 joined = cur_trans->num_joined;
943 if (!list_empty(&trans->transaction->pending_snapshots))
944 snap_pending = 1;
945
946 WARN_ON(cur_trans != trans->transaction);
947 prepare_to_wait(&cur_trans->writer_wait, &wait,
948 TASK_UNINTERRUPTIBLE);
949
950 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT;
952 else
953 timeout = 1;
954
955 mutex_unlock(&root->fs_info->trans_mutex);
956
957 if (snap_pending) {
958 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret);
960 }
961
962 schedule_timeout(timeout);
963
964 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined));
968
969 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret);
971
972 WARN_ON(cur_trans != trans->transaction);
973
974 /* btrfs_commit_tree_roots is responsible for getting the
975 * various roots consistent with each other. Every pointer
976 * in the tree of tree roots has to point to the most up to date
977 * root for every subvolume and other tree. So, we have to keep
978 * the tree logging code from jumping in and changing any
979 * of the trees.
980 *
981 * At this point in the commit, there can't be any tree-log
982 * writers, but a little lower down we drop the trans mutex
983 * and let new people in. By holding the tree_log_mutex
984 * from now until after the super is written, we avoid races
985 * with the tree-log code.
986 */
987 mutex_lock(&root->fs_info->tree_log_mutex);
988 /*
989 * keep tree reloc code from adding new reloc trees
990 */
991 mutex_lock(&root->fs_info->tree_reloc_mutex);
992
993
994 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
995 &dirty_fs_roots);
996 BUG_ON(ret);
997
998 /* add_dirty_roots gets rid of all the tree log roots, it is now
999 * safe to free the root of tree log roots
1000 */
1001 btrfs_free_log_root_tree(trans, root->fs_info);
1002
1003 ret = btrfs_commit_tree_roots(trans, root);
1004 BUG_ON(ret);
1005
1006 cur_trans = root->fs_info->running_transaction;
1007 spin_lock(&root->fs_info->new_trans_lock);
1008 root->fs_info->running_transaction = NULL;
1009 spin_unlock(&root->fs_info->new_trans_lock);
1010 btrfs_set_super_generation(&root->fs_info->super_copy,
1011 cur_trans->transid);
1012 btrfs_set_super_root(&root->fs_info->super_copy,
1013 root->fs_info->tree_root->node->start);
1014 btrfs_set_super_root_level(&root->fs_info->super_copy,
1015 btrfs_header_level(root->fs_info->tree_root->node));
1016
1017 btrfs_set_super_chunk_root(&root->fs_info->super_copy,
1018 chunk_root->node->start);
1019 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
1020 btrfs_header_level(chunk_root->node));
1021 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1022 btrfs_header_generation(chunk_root->node));
1023
1024 if (!root->fs_info->log_root_recovering) {
1025 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
1026 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
1027 }
1028
1029 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1030 sizeof(root->fs_info->super_copy));
1031
1032 btrfs_copy_pinned(root, pinned_copy);
1033
1034 trans->transaction->blocked = 0;
1035 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait);
1037
1038 mutex_unlock(&root->fs_info->trans_mutex);
1039 ret = btrfs_write_and_wait_transaction(trans, root);
1040 BUG_ON(ret);
1041 write_ctree_super(trans, root, 0);
1042
1043 /*
1044 * the super is written, we can safely allow the tree-loggers
1045 * to go about their business
1046 */
1047 mutex_unlock(&root->fs_info->tree_log_mutex);
1048
1049 btrfs_finish_extent_commit(trans, root, pinned_copy);
1050 kfree(pinned_copy);
1051
1052 btrfs_drop_dead_reloc_roots(root);
1053 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1054
1055 /* do the directory inserts of any pending snapshot creations */
1056 finish_pending_snapshots(trans, root->fs_info);
1057
1058 mutex_lock(&root->fs_info->trans_mutex);
1059
1060 cur_trans->commit_done = 1;
1061 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait);
1063
1064 put_transaction(cur_trans);
1065 put_transaction(cur_trans);
1066
1067 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1068 if (root->fs_info->closing)
1069 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1070
1071 mutex_unlock(&root->fs_info->trans_mutex);
1072
1073 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1074
1075 if (root->fs_info->closing)
1076 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1077 return ret;
1078}
1079
1080/*
1081 * interface function to delete all the snapshots we have scheduled for deletion
1082 */
1083int btrfs_clean_old_snapshots(struct btrfs_root *root)
1084{
1085 struct list_head dirty_roots;
1086 INIT_LIST_HEAD(&dirty_roots);
1087again:
1088 mutex_lock(&root->fs_info->trans_mutex);
1089 list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
1090 mutex_unlock(&root->fs_info->trans_mutex);
1091
1092 if (!list_empty(&dirty_roots)) {
1093 drop_dirty_roots(root, &dirty_roots);
1094 goto again;
1095 }
1096 return 0;
1097}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..ea292117f882
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,106 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h"
22
/*
 * State kept for one transaction.
 * NOTE(review): only fields whose use is visible next to this header are
 * annotated; the others are left undescribed rather than guessed at.
 */
23struct btrfs_transaction {
24 u64 transid; /* copied to fs_info->last_trans_committed at the end of a commit */
25 unsigned long num_writers;
26 unsigned long num_joined;
27 int in_commit;
28 int use_count; /* presumably the count dropped by put_transaction() -- TODO confirm */
29 int commit_done; /* set to 1 by the commit path after write_ctree_super() */
30 int blocked;
31 struct list_head list;
32 struct extent_io_tree dirty_pages;
33 unsigned long start_time;
34 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait; /* woken right after commit_done is set */
36 struct list_head pending_snapshots; /* presumably btrfs_pending_snapshot entries -- TODO confirm */
37};
38
/*
 * Handle passed to code that modifies trees on behalf of a transaction.
 * NOTE(review): only fields with a use visible near this header are
 * annotated.
 */
39struct btrfs_trans_handle {
40 u64 transid; /* e.g. written as the header generation of newly allocated log leaves */
41 unsigned long blocks_reserved;
42 unsigned long blocks_used;
43 struct btrfs_transaction *transaction; /* its transid is what btrfs_set_inode_last_trans() records */
44 u64 block_group; /* exchanged with the inode's block_group by the inline helpers in this header */
45 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr;
47};
48
/*
 * Description of one snapshot that has been requested but not yet created.
 * NOTE(review): presumably queued on btrfs_transaction.pending_snapshots via
 * 'list' and consumed at commit time -- confirm against transaction.c.
 */
49struct btrfs_pending_snapshot {
50 struct dentry *dentry;
51 struct btrfs_root *root;
52 char *name;
53 struct btrfs_key root_key;
54 struct list_head list;
55};
56
/*
 * List entry pairing a root with its latest version.
 * NOTE(review): presumably the element type of the dead_roots / dirty root
 * lists handled by drop_dirty_roots() and btrfs_add_dead_root() -- confirm.
 */
57struct btrfs_dirty_root {
58 struct list_head list;
59 struct btrfs_root *root;
60 struct btrfs_root *latest_root;
61};
62
63static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
64 struct inode *inode)
65{
66 trans->block_group = BTRFS_I(inode)->block_group;
67}
68
69static inline void btrfs_update_inode_block_group(
70 struct btrfs_trans_handle *trans,
71 struct inode *inode)
72{
73 BTRFS_I(inode)->block_group = trans->block_group;
74}
75
76static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
77 struct inode *inode)
78{
79 BTRFS_I(inode)->last_trans = trans->transaction->transid;
80}
81
82int btrfs_end_transaction(struct btrfs_trans_handle *trans,
83 struct btrfs_root *root);
84struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
85 int num_blocks);
86struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
87 int num_blocks);
88struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
89 int num_blocks);
90int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
92int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
93 struct btrfs_root *root);
94
95int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
96int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
97int btrfs_clean_old_snapshots(struct btrfs_root *root);
98int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
101 struct btrfs_root *root);
102void btrfs_throttle(struct btrfs_root *root);
103int btrfs_record_root_in_trans(struct btrfs_root *root);
104int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
105 struct extent_io_tree *dirty_pages);
106#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..3e8358c36165
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "transaction.h"
24#include "locking.h"
25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read
27 * things from disk, otherwise read all the leaves and try to get key order to
28 * better reflect disk order
29 */
30
31int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, int cache_only)
33{
34 struct btrfs_path *path = NULL;
35 struct btrfs_key key;
36 int ret = 0;
37 int wret;
38 int level;
39 int orig_level;
40 int is_extent = 0;
41 int next_key_ret = 0;
42 u64 last_ret = 0;
43 u64 min_trans = 0;
44
45 if (cache_only)
46 goto out;
47
48 if (root->fs_info->extent_root == root) {
49 /*
50 * there's recursion here right now in the tree locking,
51 * we can't defrag the extent root without deadlock
52 */
53 goto out;
54 }
55
56 if (root->ref_cows == 0 && !is_extent)
57 goto out;
58
59 if (btrfs_test_opt(root, SSD))
60 goto out;
61
62 path = btrfs_alloc_path();
63 if (!path)
64 return -ENOMEM;
65
66 level = btrfs_header_level(root->node);
67 orig_level = level;
68
69 if (level == 0)
70 goto out;
71
72 if (root->defrag_progress.objectid == 0) {
73 struct extent_buffer *root_node;
74 u32 nritems;
75
76 root_node = btrfs_lock_root_node(root);
77 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */
80 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
81 nritems - 1);
82 btrfs_tree_unlock(root_node);
83 free_extent_buffer(root_node);
84 memset(&key, 0, sizeof(key));
85 } else {
86 memcpy(&key, &root->defrag_progress, sizeof(key));
87 }
88
89 path->keep_locks = 1;
90 if (cache_only)
91 min_trans = root->defrag_trans_start;
92
93 ret = btrfs_search_forward(root, &key, NULL, path,
94 cache_only, min_trans);
95 if (ret < 0)
96 goto out;
97 if (ret > 0) {
98 ret = 0;
99 goto out;
100 }
101 btrfs_release_path(root, path);
102 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
103
104 if (wret < 0) {
105 ret = wret;
106 goto out;
107 }
108 if (!path->nodes[1]) {
109 ret = 0;
110 goto out;
111 }
112 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
113 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
114 min_trans);
115 ret = btrfs_realloc_node(trans, root,
116 path->nodes[1], 0,
117 cache_only, &last_ret,
118 &root->defrag_progress);
119 WARN_ON(ret && ret != -EAGAIN);
120 if (next_key_ret == 0) {
121 memcpy(&root->defrag_progress, &key, sizeof(key));
122 ret = -EAGAIN;
123 }
124
125 btrfs_release_path(root, path);
126 if (is_extent)
127 btrfs_extent_post_op(trans, root);
128out:
129 if (path)
130 btrfs_free_path(path);
131 if (ret == -EAGAIN) {
132 if (root->defrag_max.objectid > root->defrag_progress.objectid)
133 goto done;
134 if (root->defrag_max.type > root->defrag_progress.type)
135 goto done;
136 if (root->defrag_max.offset > root->defrag_progress.offset)
137 goto done;
138 ret = 0;
139 }
140done:
141 if (ret != -EAGAIN) {
142 memset(&root->defrag_progress, 0,
143 sizeof(root->defrag_progress));
144 root->defrag_trans_start = trans->transid;
145 }
146 return ret;
147}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..d81cda2e077c
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2898 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "transaction.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "print-tree.h"
25#include "compat.h"
26#include "tree-log.h"
27
28/* magic values for the inode_only field in btrfs_log_inode:
29 *
30 * LOG_INODE_ALL means to log everything
31 * LOG_INODE_EXISTS means to log just enough to recreate the inode
32 * during log replay
33 */
34#define LOG_INODE_ALL 0
35#define LOG_INODE_EXISTS 1
36
37/*
38 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes
41 * we find in the log are created in the subvolume.
42 *
43 * The last stage is to deal with directories and links and extents
44 * and all the other fun semantics
45 */
46#define LOG_WALK_PIN_ONLY 0
47#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2
49
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode,
52 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid);
56
57/*
58 * tree logging is a special write ahead log used to make sure that
59 * fsyncs and O_SYNCs can happen without doing full tree commits.
60 *
61 * Full tree commits are expensive because they require commonly
62 * modified blocks to be recowed, creating many dirty pages in the
63 * extent tree and a 4x-6x higher write load than ext3.
64 *
65 * Instead of doing a tree commit on every fsync, we use the
66 * key ranges and transaction ids to find items for a given file or directory
67 * that have changed in this transaction. Those items are copied into
68 * a special tree (one per subvolume root), that tree is written to disk
69 * and then the fsync is considered complete.
70 *
71 * After a crash, items are copied out of the log-tree back into the
72 * subvolume tree. Any file data extents found are recorded in the extent
73 * allocation tree, and the log-tree freed.
74 *
75 * The log tree is read three times: once to pin down all the extents it is
76 * using in ram, once to create all the inodes logged in the tree
77 * and once to do all the other items.
78 */
79
80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish
182 */
183static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root)
185{
186 int ret;
187 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info);
190 BUG_ON(ret);
191 }
192 if (!root->log_root) {
193 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret);
195 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex);
199 return 0;
200}
201
202/*
203 * returns 0 if there was a log transaction running and we were able
204 * to join, or returns -ENOENT if there were not transactions
205 * in progress
206 */
207static int join_running_log_trans(struct btrfs_root *root)
208{
209 int ret = -ENOENT;
210
211 smp_mb();
212 if (!root->log_root)
213 return -ENOENT;
214
215 mutex_lock(&root->fs_info->tree_log_mutex);
216 if (root->log_root) {
217 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers);
219 root->fs_info->tree_log_batch++;
220 }
221 mutex_unlock(&root->fs_info->tree_log_mutex);
222 return ret;
223}
224
225/*
226 * indicate we're done making changes to the log tree
227 * and wake up anyone waiting to do a sync
228 */
229static int end_log_trans(struct btrfs_root *root)
230{
231 atomic_dec(&root->fs_info->tree_log_writers);
232 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait))
234 wake_up(&root->fs_info->tree_log_wait);
235 return 0;
236}
237
238
239/*
240 * the walk control struct is used to pass state down the chain when
241 * processing the log tree. The stage field tells us which part
242 * of the log tree processing we are currently doing. The others
243 * are state fields used for that specific part
244 */
245struct walk_control {
246 /* should we free the extent on disk when done? This is used
247 * at transaction commit time while freeing a log tree
248 */
249 int free;
250
251 /* should we write out the extent buffer? This is used
252 * while flushing the log tree to disk during a sync
 * (process_one_buffer() calls btrfs_write_tree_block() when this is
 * set and the buffer is up to date)
253 */
254 int write;
255
256 /* should we wait for the extent buffer io to finish? Also used
257 * while flushing the log tree to disk for a sync
 * (process_one_buffer() calls btrfs_wait_tree_block_writeback() when
 * this is set and the buffer is up to date)
258 */
259 int wait;
260
261 /* pin only walk, we record which extents on disk belong to the
262 * log trees
 * (process_one_buffer() calls btrfs_update_pinned_extents() on the
 * extent root for every buffer when this is set)
263 */
264 int pin;
265
266 /* what stage of the replay code we're currently in
 * (presumably one of the LOG_WALK_* values -- TODO confirm) */
267 int stage;
268
269 /* the root we are currently replaying */
270 struct btrfs_root *replay_dest;
271
272 /* the trans handle for the current replay */
273 struct btrfs_trans_handle *trans;
274
275 /* the function that gets used to process blocks we find in the
276 * tree. Note the extent_buffer might not be up to date when it is
277 * passed in, and it must be checked or read if you need the data
278 * inside it
279 */
280 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
281 struct walk_control *wc, u64 gen);
282};
283
284/*
285 * process_func used to pin down extents, write them or wait on them
286 */
287static int process_one_buffer(struct btrfs_root *log,
288 struct extent_buffer *eb,
289 struct walk_control *wc, u64 gen)
290{
291 if (wc->pin) {
292 mutex_lock(&log->fs_info->pinned_mutex);
293 btrfs_update_pinned_extents(log->fs_info->extent_root,
294 eb->start, eb->len, 1);
295 mutex_unlock(&log->fs_info->pinned_mutex);
296 }
297
298 if (btrfs_buffer_uptodate(eb, gen)) {
299 if (wc->write)
300 btrfs_write_tree_block(eb);
301 if (wc->wait)
302 btrfs_wait_tree_block_writeback(eb);
303 }
304 return 0;
305}
306
307/*
308 * Item overwrite used by replay and tree logging. eb, slot and key all refer
309 * to the src data we are copying out.
310 *
311 * root is the tree we are copying into, and path is a scratch
312 * path for use in this function (it should be released on entry and
313 * will be released on exit).
314 *
315 * If the key is already in the destination tree the existing item is
316 * overwritten. If the existing item isn't big enough, it is extended.
317 * If it is too large, it is truncated.
318 *
319 * If the key isn't in the destination yet, a new item is inserted.
320 */
321static noinline int overwrite_item(struct btrfs_trans_handle *trans,
322 struct btrfs_root *root,
323 struct btrfs_path *path,
324 struct extent_buffer *eb, int slot,
325 struct btrfs_key *key)
326{
327 int ret;
328 u32 item_size;
329 u64 saved_i_size = 0;
330 int save_old_i_size = 0;
331 unsigned long src_ptr;
332 unsigned long dst_ptr;
333 int overwrite_root = 0;
334
335 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
336 overwrite_root = 1;
337
338 item_size = btrfs_item_size_nr(eb, slot);
339 src_ptr = btrfs_item_ptr_offset(eb, slot);
340
341 /* look for the key in the destination tree */
342 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
343 if (ret == 0) {
344 char *src_copy;
345 char *dst_copy;
346 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
347 path->slots[0]);
348 if (dst_size != item_size)
349 goto insert;
350
351 if (item_size == 0) {
352 btrfs_release_path(root, path);
353 return 0;
354 }
355 dst_copy = kmalloc(item_size, GFP_NOFS);
356 src_copy = kmalloc(item_size, GFP_NOFS);
357
358 read_extent_buffer(eb, src_copy, src_ptr, item_size);
359
360 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
361 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
362 item_size);
363 ret = memcmp(dst_copy, src_copy, item_size);
364
365 kfree(dst_copy);
366 kfree(src_copy);
367 /*
368 * they have the same contents, just return, this saves
369 * us from cowing blocks in the destination tree and doing
370 * extra writes that may not have been done by a previous
371 * sync
372 */
373 if (ret == 0) {
374 btrfs_release_path(root, path);
375 return 0;
376 }
377
378 }
379insert:
380 btrfs_release_path(root, path);
381 /* try to insert the key into the destination tree */
382 ret = btrfs_insert_empty_item(trans, root, path,
383 key, item_size);
384
385 /* make sure any existing item is the correct size */
386 if (ret == -EEXIST) {
387 u32 found_size;
388 found_size = btrfs_item_size_nr(path->nodes[0],
389 path->slots[0]);
390 if (found_size > item_size) {
391 btrfs_truncate_item(trans, root, path, item_size, 1);
392 } else if (found_size < item_size) {
393 ret = btrfs_extend_item(trans, root, path,
394 item_size - found_size);
395 BUG_ON(ret);
396 }
397 } else if (ret) {
398 BUG();
399 }
400 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
401 path->slots[0]);
402
403 /* don't overwrite an existing inode if the generation number
404 * was logged as zero. This is done when the tree logging code
405 * is just logging an inode to make sure it exists after recovery.
406 *
407 * Also, don't overwrite i_size on directories during replay.
408 * log replay inserts and removes directory items based on the
409 * state of the tree found in the subvolume, and i_size is modified
410 * as it goes
411 */
412 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
413 struct btrfs_inode_item *src_item;
414 struct btrfs_inode_item *dst_item;
415
416 src_item = (struct btrfs_inode_item *)src_ptr;
417 dst_item = (struct btrfs_inode_item *)dst_ptr;
418
419 if (btrfs_inode_generation(eb, src_item) == 0)
420 goto no_copy;
421
422 if (overwrite_root &&
423 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
424 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
425 save_old_i_size = 1;
426 saved_i_size = btrfs_inode_size(path->nodes[0],
427 dst_item);
428 }
429 }
430
431 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
432 src_ptr, item_size);
433
434 if (save_old_i_size) {
435 struct btrfs_inode_item *dst_item;
436 dst_item = (struct btrfs_inode_item *)dst_ptr;
437 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
438 }
439
440 /* make sure the generation is filled in */
441 if (key->type == BTRFS_INODE_ITEM_KEY) {
442 struct btrfs_inode_item *dst_item;
443 dst_item = (struct btrfs_inode_item *)dst_ptr;
444 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
445 btrfs_set_inode_generation(path->nodes[0], dst_item,
446 trans->transid);
447 }
448 }
449no_copy:
450 btrfs_mark_buffer_dirty(path->nodes[0]);
451 btrfs_release_path(root, path);
452 return 0;
453}
454
455/*
456 * simple helper to read an inode off the disk from a given root
457 * This can only be called for subvolume roots and not for the log
458 */
459static noinline struct inode *read_one_inode(struct btrfs_root *root,
460 u64 objectid)
461{
462 struct inode *inode;
463 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
464 if (inode->i_state & I_NEW) {
465 BTRFS_I(inode)->root = root;
466 BTRFS_I(inode)->location.objectid = objectid;
467 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
468 BTRFS_I(inode)->location.offset = 0;
469 btrfs_read_locked_inode(inode);
470 unlock_new_inode(inode);
471
472 }
473 if (is_bad_inode(inode)) {
474 iput(inode);
475 inode = NULL;
476 }
477 return inode;
478}
479
480/* replays a single extent in 'eb' at 'slot' with 'key' into the
481 * subvolume 'root'. path is released on entry and should be released
482 * on exit.
483 *
484 * extents in the log tree have not been allocated out of the extent
485 * tree yet. So, this completes the allocation, taking a reference
486 * as required if the extent already exists or creating a new extent
487 * if it isn't in the extent allocation tree yet.
488 *
489 * The extent is inserted into the file, dropping any existing extents
490 * from the file that overlap the new one.
491 */
492static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
493 struct btrfs_root *root,
494 struct btrfs_path *path,
495 struct extent_buffer *eb, int slot,
496 struct btrfs_key *key)
497{
498 int found_type;
499 u64 mask = root->sectorsize - 1;
500 u64 extent_end;
501 u64 alloc_hint;
502 u64 start = key->offset;
503 u64 saved_nbytes;
504 struct btrfs_file_extent_item *item;
505 struct inode *inode = NULL;
506 unsigned long size;
507 int ret = 0;
508
509 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
510 found_type = btrfs_file_extent_type(eb, item);
511
512 if (found_type == BTRFS_FILE_EXTENT_REG ||
513 found_type == BTRFS_FILE_EXTENT_PREALLOC)
514 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
515 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
516 size = btrfs_file_extent_inline_len(eb, item);
517 extent_end = (start + size + mask) & ~mask;
518 } else {
519 ret = 0;
520 goto out;
521 }
522
523 inode = read_one_inode(root, key->objectid);
524 if (!inode) {
525 ret = -EIO;
526 goto out;
527 }
528
529 /*
530 * first check to see if we already have this extent in the
531 * file. This must be done before the btrfs_drop_extents run
532 * so we don't try to drop this extent.
533 */
534 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
535 start, 0);
536
537 if (ret == 0 &&
538 (found_type == BTRFS_FILE_EXTENT_REG ||
539 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
540 struct btrfs_file_extent_item cmp1;
541 struct btrfs_file_extent_item cmp2;
542 struct btrfs_file_extent_item *existing;
543 struct extent_buffer *leaf;
544
545 leaf = path->nodes[0];
546 existing = btrfs_item_ptr(leaf, path->slots[0],
547 struct btrfs_file_extent_item);
548
549 read_extent_buffer(eb, &cmp1, (unsigned long)item,
550 sizeof(cmp1));
551 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
552 sizeof(cmp2));
553
554 /*
555 * we already have a pointer to this exact extent,
556 * we don't have to do anything
557 */
558 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
559 btrfs_release_path(root, path);
560 goto out;
561 }
562 }
563 btrfs_release_path(root, path);
564
565 saved_nbytes = inode_get_bytes(inode);
566 /* drop any overlapping extents */
567 ret = btrfs_drop_extents(trans, root, inode,
568 start, extent_end, start, &alloc_hint);
569 BUG_ON(ret);
570
571 if (found_type == BTRFS_FILE_EXTENT_REG ||
572 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
573 unsigned long dest_offset;
574 struct btrfs_key ins;
575
576 ret = btrfs_insert_empty_item(trans, root, path, key,
577 sizeof(*item));
578 BUG_ON(ret);
579 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
580 path->slots[0]);
581 copy_extent_buffer(path->nodes[0], eb, dest_offset,
582 (unsigned long)item, sizeof(*item));
583
584 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
585 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
586 ins.type = BTRFS_EXTENT_ITEM_KEY;
587
588 if (ins.objectid > 0) {
589 u64 csum_start;
590 u64 csum_end;
591 LIST_HEAD(ordered_sums);
592 /*
593 * is this extent already allocated in the extent
594 * allocation tree? If so, just add a reference
595 */
596 ret = btrfs_lookup_extent(root, ins.objectid,
597 ins.offset);
598 if (ret == 0) {
599 ret = btrfs_inc_extent_ref(trans, root,
600 ins.objectid, ins.offset,
601 path->nodes[0]->start,
602 root->root_key.objectid,
603 trans->transid, key->objectid);
604 } else {
605 /*
606 * insert the extent pointer in the extent
607 * allocation tree
608 */
609 ret = btrfs_alloc_logged_extent(trans, root,
610 path->nodes[0]->start,
611 root->root_key.objectid,
612 trans->transid, key->objectid,
613 &ins);
614 BUG_ON(ret);
615 }
616 btrfs_release_path(root, path);
617
618 if (btrfs_file_extent_compression(eb, item)) {
619 csum_start = ins.objectid;
620 csum_end = csum_start + ins.offset;
621 } else {
622 csum_start = ins.objectid +
623 btrfs_file_extent_offset(eb, item);
624 csum_end = csum_start +
625 btrfs_file_extent_num_bytes(eb, item);
626 }
627
628 ret = btrfs_lookup_csums_range(root->log_root,
629 csum_start, csum_end - 1,
630 &ordered_sums);
631 BUG_ON(ret);
632 while (!list_empty(&ordered_sums)) {
633 struct btrfs_ordered_sum *sums;
634 sums = list_entry(ordered_sums.next,
635 struct btrfs_ordered_sum,
636 list);
637 ret = btrfs_csum_file_blocks(trans,
638 root->fs_info->csum_root,
639 sums);
640 BUG_ON(ret);
641 list_del(&sums->list);
642 kfree(sums);
643 }
644 } else {
645 btrfs_release_path(root, path);
646 }
647 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
648 /* inline extents are easy, we just overwrite them */
649 ret = overwrite_item(trans, root, path, eb, slot, key);
650 BUG_ON(ret);
651 }
652
653 inode_set_bytes(inode, saved_nbytes);
654 btrfs_update_inode(trans, root, inode);
655out:
656 if (inode)
657 iput(inode);
658 return ret;
659}
660
661/*
662 * when cleaning up conflicts between the directory names in the
663 * subvolume, directory names in the log and directory names in the
664 * inode back references, we may have to unlink inodes from directories.
665 *
666 * This is a helper function to do the unlink of a specific directory
667 * item
668 */
669static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
670 struct btrfs_root *root,
671 struct btrfs_path *path,
672 struct inode *dir,
673 struct btrfs_dir_item *di)
674{
675 struct inode *inode;
676 char *name;
677 int name_len;
678 struct extent_buffer *leaf;
679 struct btrfs_key location;
680 int ret;
681
682 leaf = path->nodes[0];
683
684 btrfs_dir_item_key_to_cpu(leaf, di, &location);
685 name_len = btrfs_dir_name_len(leaf, di);
686 name = kmalloc(name_len, GFP_NOFS);
687 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
688 btrfs_release_path(root, path);
689
690 inode = read_one_inode(root, location.objectid);
691 BUG_ON(!inode);
692
693 ret = link_to_fixup_dir(trans, root, path, location.objectid);
694 BUG_ON(ret);
695 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
696 BUG_ON(ret);
697 kfree(name);
698
699 iput(inode);
700 return ret;
701}
702
703/*
704 * helper function to see if a given name and sequence number found
705 * in an inode back reference are already in a directory and correctly
706 * point to this inode
707 */
708static noinline int inode_in_dir(struct btrfs_root *root,
709 struct btrfs_path *path,
710 u64 dirid, u64 objectid, u64 index,
711 const char *name, int name_len)
712{
713 struct btrfs_dir_item *di;
714 struct btrfs_key location;
715 int match = 0;
716
717 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
718 index, name, name_len, 0);
719 if (di && !IS_ERR(di)) {
720 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
721 if (location.objectid != objectid)
722 goto out;
723 } else
724 goto out;
725 btrfs_release_path(root, path);
726
727 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
728 if (di && !IS_ERR(di)) {
729 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
730 if (location.objectid != objectid)
731 goto out;
732 } else
733 goto out;
734 match = 1;
735out:
736 btrfs_release_path(root, path);
737 return match;
738}
739
740/*
741 * helper function to check a log tree for a named back reference in
742 * an inode. This is used to decide if a back reference that is
743 * found in the subvolume conflicts with what we find in the log.
744 *
745 * inode backreferences may have multiple refs in a single item,
746 * during replay we process one reference at a time, and we don't
747 * want to delete valid links to a file from the subvolume if that
748 * link is also in the log.
749 */
750static noinline int backref_in_log(struct btrfs_root *log,
751 struct btrfs_key *key,
752 char *name, int namelen)
753{
754 struct btrfs_path *path;
755 struct btrfs_inode_ref *ref;
756 unsigned long ptr;
757 unsigned long ptr_end;
758 unsigned long name_ptr;
759 int found_name_len;
760 int item_size;
761 int ret;
762 int match = 0;
763
764 path = btrfs_alloc_path();
765 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
766 if (ret != 0)
767 goto out;
768
769 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
770 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
771 ptr_end = ptr + item_size;
772 while (ptr < ptr_end) {
773 ref = (struct btrfs_inode_ref *)ptr;
774 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
775 if (found_name_len == namelen) {
776 name_ptr = (unsigned long)(ref + 1);
777 ret = memcmp_extent_buffer(path->nodes[0], name,
778 name_ptr, namelen);
779 if (ret == 0) {
780 match = 1;
781 goto out;
782 }
783 }
784 ptr = (unsigned long)(ref + 1) + found_name_len;
785 }
786out:
787 btrfs_free_path(path);
788 return match;
789}
790
791
792/*
793 * replay one inode back reference item found in the log tree.
794 * eb, slot and key refer to the buffer and key found in the log tree.
795 * root is the destination we are replaying into, and path is for temp
796 * use by this function. (it should be released on return).
797 */
798static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
799 struct btrfs_root *root,
800 struct btrfs_root *log,
801 struct btrfs_path *path,
802 struct extent_buffer *eb, int slot,
803 struct btrfs_key *key)
804{
805 struct inode *dir;
806 int ret;
807 struct btrfs_key location;
808 struct btrfs_inode_ref *ref;
809 struct btrfs_dir_item *di;
810 struct inode *inode;
811 char *name;
812 int namelen;
813 unsigned long ref_ptr;
814 unsigned long ref_end;
815
816 location.objectid = key->objectid;
817 location.type = BTRFS_INODE_ITEM_KEY;
818 location.offset = 0;
819
820 /*
821 * it is possible that we didn't log all the parent directories
822 * for a given inode. If we don't find the dir, just don't
823 * copy the back ref in. The link count fixup code will take
824 * care of the rest
825 */
826 dir = read_one_inode(root, key->offset);
827 if (!dir)
828 return -ENOENT;
829
830 inode = read_one_inode(root, key->objectid);
831 BUG_ON(!dir);
832
833 ref_ptr = btrfs_item_ptr_offset(eb, slot);
834 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
835
836again:
837 ref = (struct btrfs_inode_ref *)ref_ptr;
838
839 namelen = btrfs_inode_ref_name_len(eb, ref);
840 name = kmalloc(namelen, GFP_NOFS);
841 BUG_ON(!name);
842
843 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
844
845 /* if we already have a perfect match, we're done */
846 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
847 btrfs_inode_ref_index(eb, ref),
848 name, namelen)) {
849 goto out;
850 }
851
852 /*
853 * look for a conflicting back reference in the metadata.
854 * if we find one we have to unlink that name of the file
855 * before we add our new link. Later on, we overwrite any
856 * existing back reference, and we don't want to create
857 * dangling pointers in the directory.
858 */
859conflict_again:
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr;
866 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0];
868
869 /* are we trying to overwrite a back ref for the root directory
870 * if so, just jump out, we're done
871 */
872 if (key->objectid == key->offset)
873 goto out_nowrite;
874
875 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay
877 * otherwise they must be unlinked as a conflict
878 */
879 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
880 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
881 while (ptr < ptr_end) {
882 victim_ref = (struct btrfs_inode_ref *)ptr;
883 victim_name_len = btrfs_inode_ref_name_len(leaf,
884 victim_ref);
885 victim_name = kmalloc(victim_name_len, GFP_NOFS);
886 BUG_ON(!victim_name);
887
888 read_extent_buffer(leaf, victim_name,
889 (unsigned long)(victim_ref + 1),
890 victim_name_len);
891
892 if (!backref_in_log(log, key, victim_name,
893 victim_name_len)) {
894 btrfs_inc_nlink(inode);
895 btrfs_release_path(root, path);
896 ret = btrfs_unlink_inode(trans, root, dir,
897 inode, victim_name,
898 victim_name_len);
899 kfree(victim_name);
900 btrfs_release_path(root, path);
901 goto conflict_again;
902 }
903 kfree(victim_name);
904 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
905 }
906 BUG_ON(ret);
907 }
908 btrfs_release_path(root, path);
909
910 /* look for a conflicting sequence number */
911 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
912 btrfs_inode_ref_index(eb, ref),
913 name, namelen, 0);
914 if (di && !IS_ERR(di)) {
915 ret = drop_one_dir_item(trans, root, path, dir, di);
916 BUG_ON(ret);
917 }
918 btrfs_release_path(root, path);
919
920
921 /* look for a conflicting name */
922 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
923 name, namelen, 0);
924 if (di && !IS_ERR(di)) {
925 ret = drop_one_dir_item(trans, root, path, dir, di);
926 BUG_ON(ret);
927 }
928 btrfs_release_path(root, path);
929
930 /* insert our name */
931 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
932 btrfs_inode_ref_index(eb, ref));
933 BUG_ON(ret);
934
935 btrfs_update_inode(trans, root, inode);
936
937out:
938 ref_ptr = (unsigned long)(ref + 1) + namelen;
939 kfree(name);
940 if (ref_ptr < ref_end)
941 goto again;
942
943 /* finally write the back reference in the inode */
944 ret = overwrite_item(trans, root, path, eb, slot, key);
945 BUG_ON(ret);
946
947out_nowrite:
948 btrfs_release_path(root, path);
949 iput(dir);
950 iput(inode);
951 return 0;
952}
953
954/*
955 * There are a few corners where the link count of the file can't
956 * be properly maintained during replay. So, instead of adding
957 * lots of complexity to the log code, we just scan the backrefs
958 * for any file that has been through replay.
959 *
960 * The scan will update the link count on the inode to reflect the
961 * number of back refs found. If it goes down to zero, the iput
962 * will free the inode.
963 */
964static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
965 struct btrfs_root *root,
966 struct inode *inode)
967{
968 struct btrfs_path *path;
969 int ret;
970 struct btrfs_key key;
971 u64 nlink = 0;
972 unsigned long ptr;
973 unsigned long ptr_end;
974 int name_len;
975
976 key.objectid = inode->i_ino;
977 key.type = BTRFS_INODE_REF_KEY;
978 key.offset = (u64)-1;
979
980 path = btrfs_alloc_path();
981
982 while (1) {
983 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
984 if (ret < 0)
985 break;
986 if (ret > 0) {
987 if (path->slots[0] == 0)
988 break;
989 path->slots[0]--;
990 }
991 btrfs_item_key_to_cpu(path->nodes[0], &key,
992 path->slots[0]);
993 if (key.objectid != inode->i_ino ||
994 key.type != BTRFS_INODE_REF_KEY)
995 break;
996 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
997 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
998 path->slots[0]);
999 while (ptr < ptr_end) {
1000 struct btrfs_inode_ref *ref;
1001
1002 ref = (struct btrfs_inode_ref *)ptr;
1003 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1004 ref);
1005 ptr = (unsigned long)(ref + 1) + name_len;
1006 nlink++;
1007 }
1008
1009 if (key.offset == 0)
1010 break;
1011 key.offset--;
1012 btrfs_release_path(root, path);
1013 }
1014 btrfs_free_path(path);
1015 if (nlink != inode->i_nlink) {
1016 inode->i_nlink = nlink;
1017 btrfs_update_inode(trans, root, inode);
1018 }
1019 BTRFS_I(inode)->index_cnt = (u64)-1;
1020
1021 return 0;
1022}
1023
1024static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1025 struct btrfs_root *root,
1026 struct btrfs_path *path)
1027{
1028 int ret;
1029 struct btrfs_key key;
1030 struct inode *inode;
1031
1032 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1033 key.type = BTRFS_ORPHAN_ITEM_KEY;
1034 key.offset = (u64)-1;
1035 while (1) {
1036 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1037 if (ret < 0)
1038 break;
1039
1040 if (ret == 1) {
1041 if (path->slots[0] == 0)
1042 break;
1043 path->slots[0]--;
1044 }
1045
1046 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1047 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1048 key.type != BTRFS_ORPHAN_ITEM_KEY)
1049 break;
1050
1051 ret = btrfs_del_item(trans, root, path);
1052 BUG_ON(ret);
1053
1054 btrfs_release_path(root, path);
1055 inode = read_one_inode(root, key.offset);
1056 BUG_ON(!inode);
1057
1058 ret = fixup_inode_link_count(trans, root, inode);
1059 BUG_ON(ret);
1060
1061 iput(inode);
1062
1063 if (key.offset == 0)
1064 break;
1065 key.offset--;
1066 }
1067 btrfs_release_path(root, path);
1068 return 0;
1069}
1070
1071
1072/*
1073 * record a given inode in the fixup dir so we can check its link
1074 * count when replay is done. The link count is incremented here
1075 * so the inode won't go away until we check it
1076 */
1077static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path,
1080 u64 objectid)
1081{
1082 struct btrfs_key key;
1083 int ret = 0;
1084 struct inode *inode;
1085
1086 inode = read_one_inode(root, objectid);
1087 BUG_ON(!inode);
1088
1089 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1090 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1091 key.offset = objectid;
1092
1093 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1094
1095 btrfs_release_path(root, path);
1096 if (ret == 0) {
1097 btrfs_inc_nlink(inode);
1098 btrfs_update_inode(trans, root, inode);
1099 } else if (ret == -EEXIST) {
1100 ret = 0;
1101 } else {
1102 BUG();
1103 }
1104 iput(inode);
1105
1106 return ret;
1107}
1108
1109/*
1110 * when replaying the log for a directory, we only insert names
1111 * for inodes that actually exist. This means an fsync on a directory
1112 * does not implicitly fsync all the new files in it
1113 */
1114static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1115 struct btrfs_root *root,
1116 struct btrfs_path *path,
1117 u64 dirid, u64 index,
1118 char *name, int name_len, u8 type,
1119 struct btrfs_key *location)
1120{
1121 struct inode *inode;
1122 struct inode *dir;
1123 int ret;
1124
1125 inode = read_one_inode(root, location->objectid);
1126 if (!inode)
1127 return -ENOENT;
1128
1129 dir = read_one_inode(root, dirid);
1130 if (!dir) {
1131 iput(inode);
1132 return -EIO;
1133 }
1134 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1135
1136 /* FIXME, put inode into FIXUP list */
1137
1138 iput(inode);
1139 iput(dir);
1140 return ret;
1141}
1142
1143/*
1144 * take a single entry in a log directory item and replay it into
1145 * the subvolume.
1146 *
1147 * if a conflicting item exists in the subdirectory already,
1148 * the inode it points to is unlinked and put into the link count
1149 * fix up tree.
1150 *
1151 * If a name from the log points to a file or directory that does
1152 * not exist in the FS, it is skipped. fsyncs on directories
1153 * do not force down inodes inside that directory, just changes to the
1154 * names or unlinks in a directory.
1155 */
1156static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1157 struct btrfs_root *root,
1158 struct btrfs_path *path,
1159 struct extent_buffer *eb,
1160 struct btrfs_dir_item *di,
1161 struct btrfs_key *key)
1162{
1163 char *name;
1164 int name_len;
1165 struct btrfs_dir_item *dst_di;
1166 struct btrfs_key found_key;
1167 struct btrfs_key log_key;
1168 struct inode *dir;
1169 u8 log_type;
1170 int exists;
1171 int ret;
1172
1173 dir = read_one_inode(root, key->objectid);
1174 BUG_ON(!dir);
1175
1176 name_len = btrfs_dir_name_len(eb, di);
1177 name = kmalloc(name_len, GFP_NOFS);
1178 log_type = btrfs_dir_type(eb, di);
1179 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1180 name_len);
1181
1182 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1183 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1184 if (exists == 0)
1185 exists = 1;
1186 else
1187 exists = 0;
1188 btrfs_release_path(root, path);
1189
1190 if (key->type == BTRFS_DIR_ITEM_KEY) {
1191 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1192 name, name_len, 1);
1193 } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1194 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1195 key->objectid,
1196 key->offset, name,
1197 name_len, 1);
1198 } else {
1199 BUG();
1200 }
1201 if (!dst_di || IS_ERR(dst_di)) {
1202 /* we need a sequence number to insert, so we only
1203 * do inserts for the BTRFS_DIR_INDEX_KEY types
1204 */
1205 if (key->type != BTRFS_DIR_INDEX_KEY)
1206 goto out;
1207 goto insert;
1208 }
1209
1210 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1211 /* the existing item matches the logged item */
1212 if (found_key.objectid == log_key.objectid &&
1213 found_key.type == log_key.type &&
1214 found_key.offset == log_key.offset &&
1215 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1216 goto out;
1217 }
1218
1219 /*
1220 * don't drop the conflicting directory entry if the inode
1221 * for the new entry doesn't exist
1222 */
1223 if (!exists)
1224 goto out;
1225
1226 ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1227 BUG_ON(ret);
1228
1229 if (key->type == BTRFS_DIR_INDEX_KEY)
1230 goto insert;
1231out:
1232 btrfs_release_path(root, path);
1233 kfree(name);
1234 iput(dir);
1235 return 0;
1236
1237insert:
1238 btrfs_release_path(root, path);
1239 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1240 name, name_len, log_type, &log_key);
1241
1242 if (ret && ret != -ENOENT)
1243 BUG();
1244 goto out;
1245}
1246
1247/*
1248 * find all the names in a directory item and reconcile them into
1249 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1250 * one name in a directory item, but the same code gets used for
1251 * both directory index types
1252 */
1253static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root,
1255 struct btrfs_path *path,
1256 struct extent_buffer *eb, int slot,
1257 struct btrfs_key *key)
1258{
1259 int ret;
1260 u32 item_size = btrfs_item_size_nr(eb, slot);
1261 struct btrfs_dir_item *di;
1262 int name_len;
1263 unsigned long ptr;
1264 unsigned long ptr_end;
1265
1266 ptr = btrfs_item_ptr_offset(eb, slot);
1267 ptr_end = ptr + item_size;
1268 while (ptr < ptr_end) {
1269 di = (struct btrfs_dir_item *)ptr;
1270 name_len = btrfs_dir_name_len(eb, di);
1271 ret = replay_one_name(trans, root, path, eb, di, key);
1272 BUG_ON(ret);
1273 ptr = (unsigned long)(di + 1);
1274 ptr += name_len;
1275 }
1276 return 0;
1277}
1278
1279/*
1280 * directory replay has two parts. There are the standard directory
1281 * items in the log copied from the subvolume, and range items
1282 * created in the log while the subvolume was logged.
1283 *
1284 * The range items tell us which parts of the key space the log
1285 * is authoritative for. During replay, if a key in the subvolume
1286 * directory is in a logged range item, but not actually in the log
1287 * that means it was deleted from the directory before the fsync
1288 * and should be removed.
1289 */
1290static noinline int find_dir_range(struct btrfs_root *root,
1291 struct btrfs_path *path,
1292 u64 dirid, int key_type,
1293 u64 *start_ret, u64 *end_ret)
1294{
1295 struct btrfs_key key;
1296 u64 found_end;
1297 struct btrfs_dir_log_item *item;
1298 int ret;
1299 int nritems;
1300
1301 if (*start_ret == (u64)-1)
1302 return 1;
1303
1304 key.objectid = dirid;
1305 key.type = key_type;
1306 key.offset = *start_ret;
1307
1308 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1309 if (ret < 0)
1310 goto out;
1311 if (ret > 0) {
1312 if (path->slots[0] == 0)
1313 goto out;
1314 path->slots[0]--;
1315 }
1316 if (ret != 0)
1317 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1318
1319 if (key.type != key_type || key.objectid != dirid) {
1320 ret = 1;
1321 goto next;
1322 }
1323 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1324 struct btrfs_dir_log_item);
1325 found_end = btrfs_dir_log_end(path->nodes[0], item);
1326
1327 if (*start_ret >= key.offset && *start_ret <= found_end) {
1328 ret = 0;
1329 *start_ret = key.offset;
1330 *end_ret = found_end;
1331 goto out;
1332 }
1333 ret = 1;
1334next:
1335 /* check the next slot in the tree to see if it is a valid item */
1336 nritems = btrfs_header_nritems(path->nodes[0]);
1337 if (path->slots[0] >= nritems) {
1338 ret = btrfs_next_leaf(root, path);
1339 if (ret)
1340 goto out;
1341 } else {
1342 path->slots[0]++;
1343 }
1344
1345 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1346
1347 if (key.type != key_type || key.objectid != dirid) {
1348 ret = 1;
1349 goto out;
1350 }
1351 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1352 struct btrfs_dir_log_item);
1353 found_end = btrfs_dir_log_end(path->nodes[0], item);
1354 *start_ret = key.offset;
1355 *end_ret = found_end;
1356 ret = 0;
1357out:
1358 btrfs_release_path(root, path);
1359 return ret;
1360}
1361
1362/*
1363 * this looks for a given directory item in the log. If the directory
1364 * item is not in the log, the item is removed and the inode it points
1365 * to is unlinked
1366 */
1367static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1368 struct btrfs_root *root,
1369 struct btrfs_root *log,
1370 struct btrfs_path *path,
1371 struct btrfs_path *log_path,
1372 struct inode *dir,
1373 struct btrfs_key *dir_key)
1374{
1375 int ret;
1376 struct extent_buffer *eb;
1377 int slot;
1378 u32 item_size;
1379 struct btrfs_dir_item *di;
1380 struct btrfs_dir_item *log_di;
1381 int name_len;
1382 unsigned long ptr;
1383 unsigned long ptr_end;
1384 char *name;
1385 struct inode *inode;
1386 struct btrfs_key location;
1387
1388again:
1389 eb = path->nodes[0];
1390 slot = path->slots[0];
1391 item_size = btrfs_item_size_nr(eb, slot);
1392 ptr = btrfs_item_ptr_offset(eb, slot);
1393 ptr_end = ptr + item_size;
1394 while (ptr < ptr_end) {
1395 di = (struct btrfs_dir_item *)ptr;
1396 name_len = btrfs_dir_name_len(eb, di);
1397 name = kmalloc(name_len, GFP_NOFS);
1398 if (!name) {
1399 ret = -ENOMEM;
1400 goto out;
1401 }
1402 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1403 name_len);
1404 log_di = NULL;
1405 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1406 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1407 dir_key->objectid,
1408 name, name_len, 0);
1409 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1410 log_di = btrfs_lookup_dir_index_item(trans, log,
1411 log_path,
1412 dir_key->objectid,
1413 dir_key->offset,
1414 name, name_len, 0);
1415 }
1416 if (!log_di || IS_ERR(log_di)) {
1417 btrfs_dir_item_key_to_cpu(eb, di, &location);
1418 btrfs_release_path(root, path);
1419 btrfs_release_path(log, log_path);
1420 inode = read_one_inode(root, location.objectid);
1421 BUG_ON(!inode);
1422
1423 ret = link_to_fixup_dir(trans, root,
1424 path, location.objectid);
1425 BUG_ON(ret);
1426 btrfs_inc_nlink(inode);
1427 ret = btrfs_unlink_inode(trans, root, dir, inode,
1428 name, name_len);
1429 BUG_ON(ret);
1430 kfree(name);
1431 iput(inode);
1432
1433 /* there might still be more names under this key
1434 * check and repeat if required
1435 */
1436 ret = btrfs_search_slot(NULL, root, dir_key, path,
1437 0, 0);
1438 if (ret == 0)
1439 goto again;
1440 ret = 0;
1441 goto out;
1442 }
1443 btrfs_release_path(log, log_path);
1444 kfree(name);
1445
1446 ptr = (unsigned long)(di + 1);
1447 ptr += name_len;
1448 }
1449 ret = 0;
1450out:
1451 btrfs_release_path(root, path);
1452 btrfs_release_path(log, log_path);
1453 return ret;
1454}
1455
1456/*
1457 * deletion replay happens before we copy any new directory items
1458 * out of the log or out of backreferences from inodes. It
1459 * scans the log to find ranges of keys that log is authoritative for,
1460 * and then scans the directory to find items in those ranges that are
1461 * not present in the log.
1462 *
1463 * Anything we don't find in the log is unlinked and removed from the
1464 * directory.
1465 */
1466static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1467 struct btrfs_root *root,
1468 struct btrfs_root *log,
1469 struct btrfs_path *path,
1470 u64 dirid)
1471{
1472 u64 range_start;
1473 u64 range_end;
1474 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1475 int ret = 0;
1476 struct btrfs_key dir_key;
1477 struct btrfs_key found_key;
1478 struct btrfs_path *log_path;
1479 struct inode *dir;
1480
1481 dir_key.objectid = dirid;
1482 dir_key.type = BTRFS_DIR_ITEM_KEY;
1483 log_path = btrfs_alloc_path();
1484 if (!log_path)
1485 return -ENOMEM;
1486
1487 dir = read_one_inode(root, dirid);
1488 /* it isn't an error if the inode isn't there, that can happen
1489 * because we replay the deletes before we copy in the inode item
1490 * from the log
1491 */
1492 if (!dir) {
1493 btrfs_free_path(log_path);
1494 return 0;
1495 }
1496again:
1497 range_start = 0;
1498 range_end = 0;
1499 while (1) {
1500 ret = find_dir_range(log, path, dirid, key_type,
1501 &range_start, &range_end);
1502 if (ret != 0)
1503 break;
1504
1505 dir_key.offset = range_start;
1506 while (1) {
1507 int nritems;
1508 ret = btrfs_search_slot(NULL, root, &dir_key, path,
1509 0, 0);
1510 if (ret < 0)
1511 goto out;
1512
1513 nritems = btrfs_header_nritems(path->nodes[0]);
1514 if (path->slots[0] >= nritems) {
1515 ret = btrfs_next_leaf(root, path);
1516 if (ret)
1517 break;
1518 }
1519 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1520 path->slots[0]);
1521 if (found_key.objectid != dirid ||
1522 found_key.type != dir_key.type)
1523 goto next_type;
1524
1525 if (found_key.offset > range_end)
1526 break;
1527
1528 ret = check_item_in_log(trans, root, log, path,
1529 log_path, dir, &found_key);
1530 BUG_ON(ret);
1531 if (found_key.offset == (u64)-1)
1532 break;
1533 dir_key.offset = found_key.offset + 1;
1534 }
1535 btrfs_release_path(root, path);
1536 if (range_end == (u64)-1)
1537 break;
1538 range_start = range_end + 1;
1539 }
1540
1541next_type:
1542 ret = 0;
1543 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1544 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1545 dir_key.type = BTRFS_DIR_INDEX_KEY;
1546 btrfs_release_path(root, path);
1547 goto again;
1548 }
1549out:
1550 btrfs_release_path(root, path);
1551 btrfs_free_path(log_path);
1552 iput(dir);
1553 return ret;
1554}
1555
1556/*
1557 * the process_func used to replay items from the log tree. This
1558 * gets called in two different stages. The first stage just looks
1559 * for inodes and makes sure they are all copied into the subvolume.
1560 *
1561 * The second stage copies all the other item types from the log into
1562 * the subvolume. The two stage approach is slower, but gets rid of
1563 * lots of complexity around inodes referencing other inodes that exist
1564 * only in the log (references come from either directory items or inode
1565 * back refs).
1566 */
1567static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1568 struct walk_control *wc, u64 gen)
1569{
1570 int nritems;
1571 struct btrfs_path *path;
1572 struct btrfs_root *root = wc->replay_dest;
1573 struct btrfs_key key;
1574 u32 item_size;
1575 int level;
1576 int i;
1577 int ret;
1578
1579 btrfs_read_buffer(eb, gen);
1580
1581 level = btrfs_header_level(eb);
1582
1583 if (level != 0)
1584 return 0;
1585
1586 path = btrfs_alloc_path();
1587 BUG_ON(!path);
1588
1589 nritems = btrfs_header_nritems(eb);
1590 for (i = 0; i < nritems; i++) {
1591 btrfs_item_key_to_cpu(eb, &key, i);
1592 item_size = btrfs_item_size_nr(eb, i);
1593
1594 /* inode keys are done during the first stage */
1595 if (key.type == BTRFS_INODE_ITEM_KEY &&
1596 wc->stage == LOG_WALK_REPLAY_INODES) {
1597 struct inode *inode;
1598 struct btrfs_inode_item *inode_item;
1599 u32 mode;
1600
1601 inode_item = btrfs_item_ptr(eb, i,
1602 struct btrfs_inode_item);
1603 mode = btrfs_inode_mode(eb, inode_item);
1604 if (S_ISDIR(mode)) {
1605 ret = replay_dir_deletes(wc->trans,
1606 root, log, path, key.objectid);
1607 BUG_ON(ret);
1608 }
1609 ret = overwrite_item(wc->trans, root, path,
1610 eb, i, &key);
1611 BUG_ON(ret);
1612
1613 /* for regular files, truncate away
1614 * extents past the new EOF
1615 */
1616 if (S_ISREG(mode)) {
1617 inode = read_one_inode(root,
1618 key.objectid);
1619 BUG_ON(!inode);
1620
1621 ret = btrfs_truncate_inode_items(wc->trans,
1622 root, inode, inode->i_size,
1623 BTRFS_EXTENT_DATA_KEY);
1624 BUG_ON(ret);
1625 iput(inode);
1626 }
1627 ret = link_to_fixup_dir(wc->trans, root,
1628 path, key.objectid);
1629 BUG_ON(ret);
1630 }
1631 if (wc->stage < LOG_WALK_REPLAY_ALL)
1632 continue;
1633
1634 /* these keys are simply copied */
1635 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1636 ret = overwrite_item(wc->trans, root, path,
1637 eb, i, &key);
1638 BUG_ON(ret);
1639 } else if (key.type == BTRFS_INODE_REF_KEY) {
1640 ret = add_inode_ref(wc->trans, root, log, path,
1641 eb, i, &key);
1642 BUG_ON(ret && ret != -ENOENT);
1643 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1644 ret = replay_one_extent(wc->trans, root, path,
1645 eb, i, &key);
1646 BUG_ON(ret);
1647 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1648 key.type == BTRFS_DIR_INDEX_KEY) {
1649 ret = replay_one_dir_item(wc->trans, root, path,
1650 eb, i, &key);
1651 BUG_ON(ret);
1652 }
1653 }
1654 btrfs_free_path(path);
1655 return 0;
1656}
1657
1658static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1659 struct btrfs_root *root,
1660 struct btrfs_path *path, int *level,
1661 struct walk_control *wc)
1662{
1663 u64 root_owner;
1664 u64 root_gen;
1665 u64 bytenr;
1666 u64 ptr_gen;
1667 struct extent_buffer *next;
1668 struct extent_buffer *cur;
1669 struct extent_buffer *parent;
1670 u32 blocksize;
1671 int ret = 0;
1672
1673 WARN_ON(*level < 0);
1674 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1675
1676 while (*level > 0) {
1677 WARN_ON(*level < 0);
1678 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1679 cur = path->nodes[*level];
1680
1681 if (btrfs_header_level(cur) != *level)
1682 WARN_ON(1);
1683
1684 if (path->slots[*level] >=
1685 btrfs_header_nritems(cur))
1686 break;
1687
1688 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1689 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1690 blocksize = btrfs_level_size(root, *level - 1);
1691
1692 parent = path->nodes[*level];
1693 root_owner = btrfs_header_owner(parent);
1694 root_gen = btrfs_header_generation(parent);
1695
1696 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1697
1698 wc->process_func(root, next, wc, ptr_gen);
1699
1700 if (*level == 1) {
1701 path->slots[*level]++;
1702 if (wc->free) {
1703 btrfs_read_buffer(next, ptr_gen);
1704
1705 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next);
1707 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next);
1709
1710 ret = btrfs_drop_leaf_ref(trans, root, next);
1711 BUG_ON(ret);
1712
1713 WARN_ON(root_owner !=
1714 BTRFS_TREE_LOG_OBJECTID);
1715 ret = btrfs_free_reserved_extent(root,
1716 bytenr, blocksize);
1717 BUG_ON(ret);
1718 }
1719 free_extent_buffer(next);
1720 continue;
1721 }
1722 btrfs_read_buffer(next, ptr_gen);
1723
1724 WARN_ON(*level <= 0);
1725 if (path->nodes[*level-1])
1726 free_extent_buffer(path->nodes[*level-1]);
1727 path->nodes[*level-1] = next;
1728 *level = btrfs_header_level(next);
1729 path->slots[*level] = 0;
1730 cond_resched();
1731 }
1732 WARN_ON(*level < 0);
1733 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1734
1735 if (path->nodes[*level] == root->node)
1736 parent = path->nodes[*level];
1737 else
1738 parent = path->nodes[*level + 1];
1739
1740 bytenr = path->nodes[*level]->start;
1741
1742 blocksize = btrfs_level_size(root, *level);
1743 root_owner = btrfs_header_owner(parent);
1744 root_gen = btrfs_header_generation(parent);
1745
1746 wc->process_func(root, path->nodes[*level], wc,
1747 btrfs_header_generation(path->nodes[*level]));
1748
1749 if (wc->free) {
1750 next = path->nodes[*level];
1751 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next);
1753 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next);
1755
1756 if (*level == 0) {
1757 ret = btrfs_drop_leaf_ref(trans, root, next);
1758 BUG_ON(ret);
1759 }
1760 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1761 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1762 BUG_ON(ret);
1763 }
1764 free_extent_buffer(path->nodes[*level]);
1765 path->nodes[*level] = NULL;
1766 *level += 1;
1767
1768 cond_resched();
1769 return 0;
1770}
1771
1772static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root,
1774 struct btrfs_path *path, int *level,
1775 struct walk_control *wc)
1776{
1777 u64 root_owner;
1778 u64 root_gen;
1779 int i;
1780 int slot;
1781 int ret;
1782
1783 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1784 slot = path->slots[i];
1785 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1786 struct extent_buffer *node;
1787 node = path->nodes[i];
1788 path->slots[i]++;
1789 *level = i;
1790 WARN_ON(*level == 0);
1791 return 0;
1792 } else {
1793 struct extent_buffer *parent;
1794 if (path->nodes[*level] == root->node)
1795 parent = path->nodes[*level];
1796 else
1797 parent = path->nodes[*level + 1];
1798
1799 root_owner = btrfs_header_owner(parent);
1800 root_gen = btrfs_header_generation(parent);
1801 wc->process_func(root, path->nodes[*level], wc,
1802 btrfs_header_generation(path->nodes[*level]));
1803 if (wc->free) {
1804 struct extent_buffer *next;
1805
1806 next = path->nodes[*level];
1807
1808 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next);
1810 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next);
1812
1813 if (*level == 0) {
1814 ret = btrfs_drop_leaf_ref(trans, root,
1815 next);
1816 BUG_ON(ret);
1817 }
1818
1819 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1820 ret = btrfs_free_reserved_extent(root,
1821 path->nodes[*level]->start,
1822 path->nodes[*level]->len);
1823 BUG_ON(ret);
1824 }
1825 free_extent_buffer(path->nodes[*level]);
1826 path->nodes[*level] = NULL;
1827 *level = i + 1;
1828 }
1829 }
1830 return 1;
1831}
1832
1833/*
1834 * drop the reference count on the tree rooted at 'snap'. This traverses
1835 * the tree freeing any blocks that have a ref count of zero after being
1836 * decremented.
1837 */
1838static int walk_log_tree(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *log, struct walk_control *wc)
1840{
1841 int ret = 0;
1842 int wret;
1843 int level;
1844 struct btrfs_path *path;
1845 int i;
1846 int orig_level;
1847
1848 path = btrfs_alloc_path();
1849 BUG_ON(!path);
1850
1851 level = btrfs_header_level(log->node);
1852 orig_level = level;
1853 path->nodes[level] = log->node;
1854 extent_buffer_get(log->node);
1855 path->slots[level] = 0;
1856
1857 while (1) {
1858 wret = walk_down_log_tree(trans, log, path, &level, wc);
1859 if (wret > 0)
1860 break;
1861 if (wret < 0)
1862 ret = wret;
1863
1864 wret = walk_up_log_tree(trans, log, path, &level, wc);
1865 if (wret > 0)
1866 break;
1867 if (wret < 0)
1868 ret = wret;
1869 }
1870
1871 /* was the root node processed? if not, catch it here */
1872 if (path->nodes[orig_level]) {
1873 wc->process_func(log, path->nodes[orig_level], wc,
1874 btrfs_header_generation(path->nodes[orig_level]));
1875 if (wc->free) {
1876 struct extent_buffer *next;
1877
1878 next = path->nodes[orig_level];
1879
1880 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next);
1882 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next);
1884
1885 if (orig_level == 0) {
1886 ret = btrfs_drop_leaf_ref(trans, log,
1887 next);
1888 BUG_ON(ret);
1889 }
1890 WARN_ON(log->root_key.objectid !=
1891 BTRFS_TREE_LOG_OBJECTID);
1892 ret = btrfs_free_reserved_extent(log, next->start,
1893 next->len);
1894 BUG_ON(ret);
1895 }
1896 }
1897
1898 for (i = 0; i <= orig_level; i++) {
1899 if (path->nodes[i]) {
1900 free_extent_buffer(path->nodes[i]);
1901 path->nodes[i] = NULL;
1902 }
1903 }
1904 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret;
1908}
1909
1910static int wait_log_commit(struct btrfs_root *log)
1911{
1912 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid;
1914
1915 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1917 TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit))
1920 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid &&
1924 atomic_read(&log->fs_info->tree_log_commit));
1925 return 0;
1926}
1927
1928/*
1929 * btrfs_sync_log does sends a given tree log down to the disk and
1930 * updates the super blocks to record it. When this call is done,
1931 * you know that any inodes previously logged are safely on disk
1932 */
1933int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root)
1935{
1936 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root;
1939
1940 mutex_lock(&log->fs_info->tree_log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) {
1942 wait_log_commit(log);
1943 goto out;
1944 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1);
1946
1947 while (1) {
1948 batch = log->fs_info->tree_log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex);
1950 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex);
1952
1953 while (atomic_read(&log->fs_info->tree_log_writers)) {
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break;
1965 }
1966
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1970 &root->fs_info->log_root_tree->dirty_log_pages);
1971 BUG_ON(ret);
1972
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node));
1977
1978 write_ctree_super(trans, log->fs_info->tree_root, 2);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait))
1984 wake_up(&log->fs_info->tree_log_wait);
1985out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex);
1987 return 0;
1988}
1989
1990/* * free all the extents used by the tree log. This should be called
1991 * at commit time of the full transaction
1992 */
1993int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
1994{
1995 int ret;
1996 struct btrfs_root *log;
1997 struct key;
1998 u64 start;
1999 u64 end;
2000 struct walk_control wc = {
2001 .free = 1,
2002 .process_func = process_one_buffer
2003 };
2004
2005 if (!root->log_root || root->fs_info->log_root_recovering)
2006 return 0;
2007
2008 log = root->log_root;
2009 ret = walk_log_tree(trans, log, &wc);
2010 BUG_ON(ret);
2011
2012 while (1) {
2013 ret = find_first_extent_bit(&log->dirty_log_pages,
2014 0, &start, &end, EXTENT_DIRTY);
2015 if (ret)
2016 break;
2017
2018 clear_extent_dirty(&log->dirty_log_pages,
2019 start, end, GFP_NOFS);
2020 }
2021
2022 log = root->log_root;
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key);
2025 BUG_ON(ret);
2026 root->log_root = NULL;
2027 kfree(root->log_root);
2028 return 0;
2029}
2030
2031/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners:
2056 *
2057 * create file X in dir Y
2058 * link file X to X.link in dir Y
2059 * fsync file X
2060 * unlink file X but leave X.link
2061 * fsync dir Y
2062 *
2063 * After a crash we would expect only X.link to exist. But file X
2064 * didn't get fsync'd again so the log has back refs for X and X.link.
2065 *
2066 * We solve this by removing directory entries and inode backrefs from the
2067 * log when a file that was logged in the current transaction is
2068 * unlinked. Any later fsync will include the updated log entries, and
2069 * we'll be able to reconstruct the proper directory items from backrefs.
2070 *
2071 * This optimizations allows us to avoid relogging the entire inode
2072 * or the entire directory.
2073 */
2074int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2075 struct btrfs_root *root,
2076 const char *name, int name_len,
2077 struct inode *dir, u64 index)
2078{
2079 struct btrfs_root *log;
2080 struct btrfs_dir_item *di;
2081 struct btrfs_path *path;
2082 int ret;
2083 int bytes_del = 0;
2084
2085 if (BTRFS_I(dir)->logged_trans < trans->transid)
2086 return 0;
2087
2088 ret = join_running_log_trans(root);
2089 if (ret)
2090 return 0;
2091
2092 mutex_lock(&BTRFS_I(dir)->log_mutex);
2093
2094 log = root->log_root;
2095 path = btrfs_alloc_path();
2096 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2097 name, name_len, -1);
2098 if (di && !IS_ERR(di)) {
2099 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2100 bytes_del += name_len;
2101 BUG_ON(ret);
2102 }
2103 btrfs_release_path(log, path);
2104 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2105 index, name, name_len, -1);
2106 if (di && !IS_ERR(di)) {
2107 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2108 bytes_del += name_len;
2109 BUG_ON(ret);
2110 }
2111
2112 /* update the directory size in the log to reflect the names
2113 * we have removed
2114 */
2115 if (bytes_del) {
2116 struct btrfs_key key;
2117
2118 key.objectid = dir->i_ino;
2119 key.offset = 0;
2120 key.type = BTRFS_INODE_ITEM_KEY;
2121 btrfs_release_path(log, path);
2122
2123 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2124 if (ret == 0) {
2125 struct btrfs_inode_item *item;
2126 u64 i_size;
2127
2128 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2129 struct btrfs_inode_item);
2130 i_size = btrfs_inode_size(path->nodes[0], item);
2131 if (i_size > bytes_del)
2132 i_size -= bytes_del;
2133 else
2134 i_size = 0;
2135 btrfs_set_inode_size(path->nodes[0], item, i_size);
2136 btrfs_mark_buffer_dirty(path->nodes[0]);
2137 } else
2138 ret = 0;
2139 btrfs_release_path(log, path);
2140 }
2141
2142 btrfs_free_path(path);
2143 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2144 end_log_trans(root);
2145
2146 return 0;
2147}
2148
2149/* see comments for btrfs_del_dir_entries_in_log */
2150int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2151 struct btrfs_root *root,
2152 const char *name, int name_len,
2153 struct inode *inode, u64 dirid)
2154{
2155 struct btrfs_root *log;
2156 u64 index;
2157 int ret;
2158
2159 if (BTRFS_I(inode)->logged_trans < trans->transid)
2160 return 0;
2161
2162 ret = join_running_log_trans(root);
2163 if (ret)
2164 return 0;
2165 log = root->log_root;
2166 mutex_lock(&BTRFS_I(inode)->log_mutex);
2167
2168 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2169 dirid, &index);
2170 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2171 end_log_trans(root);
2172
2173 return ret;
2174}
2175
2176/*
2177 * creates a range item in the log for 'dirid'. first_offset and
2178 * last_offset tell us which parts of the key space the log should
2179 * be considered authoritative for.
2180 */
2181static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2182 struct btrfs_root *log,
2183 struct btrfs_path *path,
2184 int key_type, u64 dirid,
2185 u64 first_offset, u64 last_offset)
2186{
2187 int ret;
2188 struct btrfs_key key;
2189 struct btrfs_dir_log_item *item;
2190
2191 key.objectid = dirid;
2192 key.offset = first_offset;
2193 if (key_type == BTRFS_DIR_ITEM_KEY)
2194 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2195 else
2196 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2197 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2198 BUG_ON(ret);
2199
2200 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2201 struct btrfs_dir_log_item);
2202 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2203 btrfs_mark_buffer_dirty(path->nodes[0]);
2204 btrfs_release_path(log, path);
2205 return 0;
2206}
2207
2208/*
2209 * log all the items included in the current transaction for a given
2210 * directory. This also creates the range items in the log tree required
2211 * to replay anything deleted before the fsync
 *
 * key_type is the directory key type to log, min_offset the first key
 * offset to consider.  *last_offset_ret receives the last offset covered
 * by this call; it is (u64)-1 once the end of the directory's keys of
 * this type has been reached, which is what the caller loops on.
 *
 * Returns 0 on success, or the negative value from btrfs_search_slot in
 * the "nothing from this transaction" path (in that case
 * *last_offset_ret is left untouched).  Other failures BUG.
 *
 * Note: path->keep_locks is set to 1 here and not reset by this function.
2212 */
2213static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2214 struct btrfs_root *root, struct inode *inode,
2215 struct btrfs_path *path,
2216 struct btrfs_path *dst_path, int key_type,
2217 u64 min_offset, u64 *last_offset_ret)
2218{
2219 struct btrfs_key min_key;
2220 struct btrfs_key max_key;
2221 struct btrfs_root *log = root->log_root;
2222 struct extent_buffer *src;
2223 int ret;
2224 int i;
2225 int nritems;
/* [first_offset, last_offset] is the range the log will claim to cover */
2226 u64 first_offset = min_offset;
2227 u64 last_offset = (u64)-1;
2228
/* same value as the initializer above; the reassignment is redundant */
2229 log = root->log_root;
2230 max_key.objectid = inode->i_ino;
2231 max_key.offset = (u64)-1;
2232 max_key.type = key_type;
2233
2234 min_key.objectid = inode->i_ino;
2235 min_key.type = key_type;
2236 min_key.offset = min_offset;
2237
2238 path->keep_locks = 1;
2239
2240 ret = btrfs_search_forward(root, &min_key, &max_key,
2241 path, 0, trans->transid);
2242
2243 /*
2244 * we didn't find anything from this transaction, see if there
2245 * is anything at all
2246 */
2247 if (ret != 0 || min_key.objectid != inode->i_ino ||
2248 min_key.type != key_type) {
2249 min_key.objectid = inode->i_ino;
2250 min_key.type = key_type;
2251 min_key.offset = (u64)-1;
2252 btrfs_release_path(root, path);
2253 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2254 if (ret < 0) {
2255 btrfs_release_path(root, path);
2256 return ret;
2257 }
2258 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2259
2260 /* if ret == 0 there are items for this type,
2261 * create a range to tell us the last key of this type.
2262 * otherwise, there are no items in this directory after
2263 * *min_offset, and we create a range to indicate that.
2264 */
2265 if (ret == 0) {
2266 struct btrfs_key tmp;
2267 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2268 path->slots[0]);
2269 if (key_type == tmp.type)
2270 first_offset = max(min_offset, tmp.offset) + 1;
2271 }
2272 goto done;
2273 }
2274
2275 /* go backward to find any previous key */
2276 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2277 if (ret == 0) {
2278 struct btrfs_key tmp;
2279 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2280 if (key_type == tmp.type) {
2281 first_offset = tmp.offset;
/*
 * NOTE(review): unlike the other overwrite_item calls in this
 * function, this result is not checked; ret is overwritten by the
 * search below.  Confirm whether that is intentional.
 */
2282 ret = overwrite_item(trans, log, dst_path,
2283 path->nodes[0], path->slots[0],
2284 &tmp);
2285 }
2286 }
2287 btrfs_release_path(root, path);
2288
2289 /* find the first key from this transaction again */
2290 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2291 if (ret != 0) {
2292 WARN_ON(1);
2293 goto done;
2294 }
2295
2296 /*
2297 * we have a block from this transaction, log every item in it
2298 * from our directory
2299 */
2300 while (1) {
2301 struct btrfs_key tmp;
2302 src = path->nodes[0];
2303 nritems = btrfs_header_nritems(src);
2304 for (i = path->slots[0]; i < nritems; i++) {
2305 btrfs_item_key_to_cpu(src, &min_key, i);
2306
/* ran off the end of this directory's keys: last_offset stays (u64)-1 */
2307 if (min_key.objectid != inode->i_ino ||
2308 min_key.type != key_type)
2309 goto done;
2310 ret = overwrite_item(trans, log, dst_path, src, i,
2311 &min_key);
2312 BUG_ON(ret);
2313 }
2314 path->slots[0] = nritems;
2315
2316 /*
2317 * look ahead to the next item and see if it is also
2318 * from this directory and from this transaction
2319 */
2320 ret = btrfs_next_leaf(root, path);
2321 if (ret == 1) {
2322 last_offset = (u64)-1;
2323 goto done;
2324 }
2325 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2326 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2327 last_offset = (u64)-1;
2328 goto done;
2329 }
/*
 * next leaf is not from this transaction: log only its first item
 * and end the covered range there
 */
2330 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2331 ret = overwrite_item(trans, log, dst_path,
2332 path->nodes[0], path->slots[0],
2333 &tmp);
2334
2335 BUG_ON(ret);
2336 last_offset = tmp.offset;
2337 goto done;
2338 }
2339 }
2340done:
2341 *last_offset_ret = last_offset;
2342 btrfs_release_path(root, path);
2343 btrfs_release_path(log, dst_path);
2344
2345 /* insert the log range keys to indicate where the log is valid */
/* 'path' was released above and is reused here for the log tree insert */
2346 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2347 first_offset, last_offset);
2348 BUG_ON(ret);
2349 return 0;
2350}
2351
2352/*
2353 * logging directories is very similar to logging inodes, We find all the items
2354 * from the current transaction and write them to the log.
2355 *
2356 * The recovery code scans the directory in the subvolume, and if it finds a
2357 * key in the range logged that is not present in the log tree, then it means
2358 * that dir entry was unlinked during the transaction.
2359 *
2360 * In order for that scan to work, we must include one key smaller than
2361 * the smallest logged by this transaction and one key larger than the largest
2362 * key logged by this transaction.
2363 */
2364static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2365 struct btrfs_root *root, struct inode *inode,
2366 struct btrfs_path *path,
2367 struct btrfs_path *dst_path)
2368{
2369 u64 min_key;
2370 u64 max_key;
2371 int ret;
2372 int key_type = BTRFS_DIR_ITEM_KEY;
2373
2374again:
2375 min_key = 0;
2376 max_key = 0;
2377 while (1) {
2378 ret = log_dir_items(trans, root, inode, path,
2379 dst_path, key_type, min_key,
2380 &max_key);
2381 BUG_ON(ret);
2382 if (max_key == (u64)-1)
2383 break;
2384 min_key = max_key + 1;
2385 }
2386
2387 if (key_type == BTRFS_DIR_ITEM_KEY) {
2388 key_type = BTRFS_DIR_INDEX_KEY;
2389 goto again;
2390 }
2391 return 0;
2392}
2393
2394/*
2395 * a helper function to drop items from the log before we relog an
2396 * inode. max_key_type indicates the highest item type to remove.
2397 * This cannot be run for file data extents because it does not
2398 * free the extents they point to.
2399 */
2400static int drop_objectid_items(struct btrfs_trans_handle *trans,
2401 struct btrfs_root *log,
2402 struct btrfs_path *path,
2403 u64 objectid, int max_key_type)
2404{
2405 int ret;
2406 struct btrfs_key key;
2407 struct btrfs_key found_key;
2408
2409 key.objectid = objectid;
2410 key.type = max_key_type;
2411 key.offset = (u64)-1;
2412
2413 while (1) {
2414 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2415
2416 if (ret != 1)
2417 break;
2418
2419 if (path->slots[0] == 0)
2420 break;
2421
2422 path->slots[0]--;
2423 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2424 path->slots[0]);
2425
2426 if (found_key.objectid != objectid)
2427 break;
2428
2429 ret = btrfs_del_item(trans, log, path);
2430 BUG_ON(ret);
2431 btrfs_release_path(log, path);
2432 }
2433 btrfs_release_path(log, path);
2434 return 0;
2435}
2436
2437static noinline int copy_items(struct btrfs_trans_handle *trans,
2438 struct btrfs_root *log,
2439 struct btrfs_path *dst_path,
2440 struct extent_buffer *src,
2441 int start_slot, int nr, int inode_only)
2442{
2443 unsigned long src_offset;
2444 unsigned long dst_offset;
2445 struct btrfs_file_extent_item *extent;
2446 struct btrfs_inode_item *inode_item;
2447 int ret;
2448 struct btrfs_key *ins_keys;
2449 u32 *ins_sizes;
2450 char *ins_data;
2451 int i;
2452 struct list_head ordered_sums;
2453
2454 INIT_LIST_HEAD(&ordered_sums);
2455
2456 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2457 nr * sizeof(u32), GFP_NOFS);
2458 ins_sizes = (u32 *)ins_data;
2459 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2460
2461 for (i = 0; i < nr; i++) {
2462 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2463 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2464 }
2465 ret = btrfs_insert_empty_items(trans, log, dst_path,
2466 ins_keys, ins_sizes, nr);
2467 BUG_ON(ret);
2468
2469 for (i = 0; i < nr; i++) {
2470 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2471 dst_path->slots[0]);
2472
2473 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2474
2475 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2476 src_offset, ins_sizes[i]);
2477
2478 if (inode_only == LOG_INODE_EXISTS &&
2479 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2480 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2481 dst_path->slots[0],
2482 struct btrfs_inode_item);
2483 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2484
2485 /* set the generation to zero so the recover code
2486 * can tell the difference between an logging
2487 * just to say 'this inode exists' and a logging
2488 * to say 'update this inode with these values'
2489 */
2490 btrfs_set_inode_generation(dst_path->nodes[0],
2491 inode_item, 0);
2492 }
2493 /* take a reference on file data extents so that truncates
2494 * or deletes of this inode don't have to relog the inode
2495 * again
2496 */
2497 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2498 int found_type;
2499 extent = btrfs_item_ptr(src, start_slot + i,
2500 struct btrfs_file_extent_item);
2501
2502 found_type = btrfs_file_extent_type(src, extent);
2503 if (found_type == BTRFS_FILE_EXTENT_REG ||
2504 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2505 u64 ds = btrfs_file_extent_disk_bytenr(src,
2506 extent);
2507 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2508 extent);
2509 u64 cs = btrfs_file_extent_offset(src, extent);
2510 u64 cl = btrfs_file_extent_num_bytes(src,
2511 extent);;
2512 if (btrfs_file_extent_compression(src,
2513 extent)) {
2514 cs = 0;
2515 cl = dl;
2516 }
2517 /* ds == 0 is a hole */
2518 if (ds != 0) {
2519 ret = btrfs_inc_extent_ref(trans, log,
2520 ds, dl,
2521 dst_path->nodes[0]->start,
2522 BTRFS_TREE_LOG_OBJECTID,
2523 trans->transid,
2524 ins_keys[i].objectid);
2525 BUG_ON(ret);
2526 ret = btrfs_lookup_csums_range(
2527 log->fs_info->csum_root,
2528 ds + cs, ds + cs + cl - 1,
2529 &ordered_sums);
2530 BUG_ON(ret);
2531 }
2532 }
2533 }
2534 dst_path->slots[0]++;
2535 }
2536
2537 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2538 btrfs_release_path(log, dst_path);
2539 kfree(ins_data);
2540
2541 /*
2542 * we have to do this after the loop above to avoid changing the
2543 * log tree while trying to change the log tree.
2544 */
2545 while (!list_empty(&ordered_sums)) {
2546 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2547 struct btrfs_ordered_sum,
2548 list);
2549 ret = btrfs_csum_file_blocks(trans, log, sums);
2550 BUG_ON(ret);
2551 list_del(&sums->list);
2552 kfree(sums);
2553 }
2554 return 0;
2555}
2556
2557/* log a single inode in the tree log.
2558 * At least one parent directory for this inode must exist in the tree
2559 * or be logged already.
2560 *
2561 * Any items from this inode changed by the current transaction are copied
2562 * to the log tree. An extra reference is taken on any extents in this
2563 * file, allowing us to avoid a whole pile of corner cases around logging
2564 * blocks that have been removed from the tree.
2565 *
2566 * See LOG_INODE_ALL and related defines for a description of what inode_only
2567 * does.
2568 *
2569 * This handles both files and directories.
2570 */
2571static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2572 struct btrfs_root *root, struct inode *inode,
2573 int inode_only)
2574{
2575 struct btrfs_path *path;
2576 struct btrfs_path *dst_path;
2577 struct btrfs_key min_key;
2578 struct btrfs_key max_key;
2579 struct btrfs_root *log = root->log_root;
2580 struct extent_buffer *src = NULL;
2581 u32 size;
2582 int ret;
2583 int nritems;
2584 int ins_start_slot = 0;
2585 int ins_nr;
2586
2587 log = root->log_root;
2588
2589 path = btrfs_alloc_path();
2590 dst_path = btrfs_alloc_path();
2591
2592 min_key.objectid = inode->i_ino;
2593 min_key.type = BTRFS_INODE_ITEM_KEY;
2594 min_key.offset = 0;
2595
2596 max_key.objectid = inode->i_ino;
2597 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2598 max_key.type = BTRFS_XATTR_ITEM_KEY;
2599 else
2600 max_key.type = (u8)-1;
2601 max_key.offset = (u64)-1;
2602
2603 /*
2604 * if this inode has already been logged and we're in inode_only
2605 * mode, we don't want to delete the things that have already
2606 * been written to the log.
2607 *
2608 * But, if the inode has been through an inode_only log,
2609 * the logged_trans field is not set. This allows us to catch
2610 * any new names for this inode in the backrefs by logging it
2611 * again
2612 */
2613 if (inode_only == LOG_INODE_EXISTS &&
2614 BTRFS_I(inode)->logged_trans == trans->transid) {
2615 btrfs_free_path(path);
2616 btrfs_free_path(dst_path);
2617 goto out;
2618 }
2619 mutex_lock(&BTRFS_I(inode)->log_mutex);
2620
2621 /*
2622 * a brute force approach to making sure we get the most uptodate
2623 * copies of everything.
2624 */
2625 if (S_ISDIR(inode->i_mode)) {
2626 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2627
2628 if (inode_only == LOG_INODE_EXISTS)
2629 max_key_type = BTRFS_XATTR_ITEM_KEY;
2630 ret = drop_objectid_items(trans, log, path,
2631 inode->i_ino, max_key_type);
2632 } else {
2633 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2634 }
2635 BUG_ON(ret);
2636 path->keep_locks = 1;
2637
2638 while (1) {
2639 ins_nr = 0;
2640 ret = btrfs_search_forward(root, &min_key, &max_key,
2641 path, 0, trans->transid);
2642 if (ret != 0)
2643 break;
2644again:
2645 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2646 if (min_key.objectid != inode->i_ino)
2647 break;
2648 if (min_key.type > max_key.type)
2649 break;
2650
2651 src = path->nodes[0];
2652 size = btrfs_item_size_nr(src, path->slots[0]);
2653 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2654 ins_nr++;
2655 goto next_slot;
2656 } else if (!ins_nr) {
2657 ins_start_slot = path->slots[0];
2658 ins_nr = 1;
2659 goto next_slot;
2660 }
2661
2662 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2663 ins_nr, inode_only);
2664 BUG_ON(ret);
2665 ins_nr = 1;
2666 ins_start_slot = path->slots[0];
2667next_slot:
2668
2669 nritems = btrfs_header_nritems(path->nodes[0]);
2670 path->slots[0]++;
2671 if (path->slots[0] < nritems) {
2672 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2673 path->slots[0]);
2674 goto again;
2675 }
2676 if (ins_nr) {
2677 ret = copy_items(trans, log, dst_path, src,
2678 ins_start_slot,
2679 ins_nr, inode_only);
2680 BUG_ON(ret);
2681 ins_nr = 0;
2682 }
2683 btrfs_release_path(root, path);
2684
2685 if (min_key.offset < (u64)-1)
2686 min_key.offset++;
2687 else if (min_key.type < (u8)-1)
2688 min_key.type++;
2689 else if (min_key.objectid < (u64)-1)
2690 min_key.objectid++;
2691 else
2692 break;
2693 }
2694 if (ins_nr) {
2695 ret = copy_items(trans, log, dst_path, src,
2696 ins_start_slot,
2697 ins_nr, inode_only);
2698 BUG_ON(ret);
2699 ins_nr = 0;
2700 }
2701 WARN_ON(ins_nr);
2702 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2703 btrfs_release_path(root, path);
2704 btrfs_release_path(log, dst_path);
2705 BTRFS_I(inode)->log_dirty_trans = 0;
2706 ret = log_directory_changes(trans, root, inode, path, dst_path);
2707 BUG_ON(ret);
2708 }
2709 BTRFS_I(inode)->logged_trans = trans->transid;
2710 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2711
2712 btrfs_free_path(path);
2713 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out:
2720 return 0;
2721}
2722
/*
 * Log one inode inside its own log transaction: start_log_trans(),
 * __btrfs_log_inode(), end_log_trans().  Returns what
 * __btrfs_log_inode() returned.
 */
int btrfs_log_inode(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct inode *inode,
		    int inode_only)
{
	int err;

	start_log_trans(trans, root);
	err = __btrfs_log_inode(trans, root, inode, inode_only);
	end_log_trans(root);

	return err;
}
2734
2735/*
2736 * helper function around btrfs_log_inode to make sure newly created
2737 * parent directories also end up in the log. A minimal inode and backref
2738 * only logging is done of any parent directories that are older than
2739 * the last committed transaction
2740 */
2741int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2742 struct btrfs_root *root, struct dentry *dentry)
2743{
2744 int inode_only = LOG_INODE_ALL;
2745 struct super_block *sb;
2746 int ret;
2747
2748 start_log_trans(trans, root);
2749 sb = dentry->d_inode->i_sb;
2750 while (1) {
2751 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2752 inode_only);
2753 BUG_ON(ret);
2754 inode_only = LOG_INODE_EXISTS;
2755
2756 dentry = dentry->d_parent;
2757 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2758 break;
2759
2760 if (BTRFS_I(dentry->d_inode)->generation <=
2761 root->fs_info->last_trans_committed)
2762 break;
2763 }
2764 end_log_trans(root);
2765 return 0;
2766}
2767
2768/*
2769 * it is not safe to log dentry if the chunk root has added new
2770 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
2771 * If this returns 1, you must commit the transaction to safely get your
2772 * data on disk.
2773 */
2774int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2775 struct btrfs_root *root, struct dentry *dentry)
2776{
2777 u64 gen;
2778 gen = root->fs_info->last_trans_new_blockgroup;
2779 if (gen > root->fs_info->last_trans_committed)
2780 return 1;
2781 else
2782 return btrfs_log_dentry(trans, root, dentry);
2783}
2784
2785/*
2786 * should be called during mount to recover and replay any log trees
2787 * from the FS
 *
 * The log roots are walked several times, each time from the highest
 * BTRFS_TREE_LOG_OBJECTID key offset downwards:
 *   1. wc.pin set, process_one_buffer as the callback
 *   2. replay_one_buffer with stage LOG_WALK_REPLAY_INODES
 *   3. the stage is incremented and the walk repeated until
 *      LOG_WALK_REPLAY_ALL has been run
 * Afterwards the transaction is committed and log_root_tree itself is
 * freed, so the caller must not use it again.
 *
 * Always returns 0; most helper failures BUG.
2788 */
2789int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2790{
2791 int ret;
2792 struct btrfs_path *path;
2793 struct btrfs_trans_handle *trans;
2794 struct btrfs_key key;
2795 struct btrfs_key found_key;
2796 struct btrfs_key tmp_key;
2797 struct btrfs_root *log;
2798 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2799 u64 highest_inode;
2800 struct walk_control wc = {
2801 .process_func = process_one_buffer,
2802 .stage = 0,
2803 };
2804
/* btrfs_free_log() returns early while this flag is set */
2805 fs_info->log_root_recovering = 1;
2806 path = btrfs_alloc_path();
2807 BUG_ON(!path);
2808
/* NOTE(review): the returned handle is used without a check here */
2809 trans = btrfs_start_transaction(fs_info->tree_root, 1);
2810
2811 wc.trans = trans;
2812 wc.pin = 1;
2813
/* NOTE(review): return value of this first walk is ignored */
2814 walk_log_tree(trans, log_root_tree, &wc);
2815
2816again:
/* each pass restarts the scan at the highest possible log root key */
2817 key.objectid = BTRFS_TREE_LOG_OBJECTID;
2818 key.offset = (u64)-1;
2819 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2820
2821 while (1) {
2822 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2823 if (ret < 0)
2824 break;
/* no exact match: step back to the item in front of the slot, if any */
2825 if (ret > 0) {
2826 if (path->slots[0] == 0)
2827 break;
2828 path->slots[0]--;
2829 }
2830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2831 path->slots[0]);
2832 btrfs_release_path(log_root_tree, path);
2833 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2834 break;
2835
2836 log = btrfs_read_fs_root_no_radix(log_root_tree,
2837 &found_key);
2838 BUG_ON(!log);
2839
2840
/* the log root's key offset is used as the objectid of the root to replay into */
2841 tmp_key.objectid = found_key.offset;
2842 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2843 tmp_key.offset = (u64)-1;
2844
2845 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2846 BUG_ON(!wc.replay_dest);
2847
2848 wc.replay_dest->log_root = log;
2849 btrfs_record_root_in_trans(wc.replay_dest);
2850 ret = walk_log_tree(trans, log, &wc);
2851 BUG_ON(ret);
2852
2853 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2854 ret = fixup_inode_link_counts(trans, wc.replay_dest,
2855 path);
2856 BUG_ON(ret);
2857 }
2858 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2859 if (ret == 0) {
2860 wc.replay_dest->highest_inode = highest_inode;
2861 wc.replay_dest->last_inode_alloc = highest_inode;
2862 }
2863
/* continue with the next lower log root; this in-memory copy is dropped */
2864 key.offset = found_key.offset - 1;
2865 wc.replay_dest->log_root = NULL;
2866 free_extent_buffer(log->node);
2867 kfree(log);
2868
2869 if (found_key.offset == 0)
2870 break;
2871 }
2872 btrfs_release_path(log_root_tree, path);
2873
2874 /* step one is to pin it all, step two is to replay just inodes */
2875 if (wc.pin) {
2876 wc.pin = 0;
2877 wc.process_func = replay_one_buffer;
2878 wc.stage = LOG_WALK_REPLAY_INODES;
2879 goto again;
2880 }
2881 /* step three is to replay everything */
2882 if (wc.stage < LOG_WALK_REPLAY_ALL) {
2883 wc.stage++;
2884 goto again;
2885 }
2886
2887 btrfs_free_path(path);
2888
2889 free_extent_buffer(log_root_tree->node);
2890 log_root_tree->log_root = NULL;
2891 fs_info->log_root_recovering = 0;
2892
2893 /* step 4: commit the transaction, which also unpins the blocks */
2894 btrfs_commit_transaction(trans, fs_info->tree_root);
2895
2896 kfree(log_root_tree);
2897 return 0;
2898}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
/* Public interface of the btrfs tree log (implemented in tree-log.c). */
19#ifndef __TREE_LOG_
20#define __TREE_LOG_
21
/* write the root's log tree to disk and record it in the super blocks */
22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root);
/* free all extents used by the root's log; call at full transaction commit */
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
/* log the dentry's inode plus any parent directories newer than the last commit */
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
/* replay log trees at mount time; the argument is the tree of log roots */
27int btrfs_recover_log_trees(struct btrfs_root *tree_root);
/* like btrfs_log_dentry, but returns 1 (nothing logged) when a full commit is required */
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry);
/* log a single inode; inode_only is one of the LOG_INODE_* modes */
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
/* remove a name's dir item and dir index item from the log of 'dir' */
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
35 const char *name, int name_len,
36 struct inode *dir, u64 index);
/* remove the inode back reference for 'name' in directory 'dirid' from the log */
37int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root,
39 const char *name, int name_len,
40 struct inode *inode, u64 dirid);
41#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 000000000000..9bf3946d5ef2
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
/*
 * Default build version string.
 * NOTE(review): version.sh writes a header defining the same macro with a
 * git-derived version and moves it over a file named version.h in its
 * working directory; presumably that is this file -- confirm.
 */
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..1ca1952fd917
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
1#!/bin/bash
2#
3# determine-version -- report a useful version for releases
4#
5# Copyright 2008, Aron Griffis <agriffis@n01se.net>
6# Copyright 2008, Oracle
7# Released under the GNU GPLv2
8
# fallback version when git cannot tell us anything better
v="v0.16"

# Only ask git when it is installed, we are inside a repository and HEAD
# resolves to a commit.
if which git &> /dev/null && git branch >& /dev/null &&
   head=$(git rev-parse --verify HEAD 2>/dev/null); then
	if tag=$(git describe --tags 2>/dev/null); then
		v="$tag"
	fi

	# Are there uncommitted changes?
	git update-index --refresh --unmerged > /dev/null
	if git diff-index --name-only HEAD |
	   grep -v "^scripts/package" |
	   read dummy; then
		v="$v"-dirty
	fi
fi

# generate the candidate header
{
	echo "#ifndef __BUILD_VERSION"
	echo "#define __BUILD_VERSION"
	echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\""
	echo "#endif"
} > .build-version.h

# leave version.h untouched when nothing changed
if diff -q version.h .build-version.h >& /dev/null; then
	rm .build-version.h
	exit 0
fi

mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..3451e1cca2b5
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3219 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/blkdev.h>
22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h>
25#include "compat.h"
26#include "ctree.h"
27#include "extent_map.h"
28#include "disk-io.h"
29#include "transaction.h"
30#include "print-tree.h"
31#include "volumes.h"
32#include "async-thread.h"
33
/*
 * Stripe layout record with a trailing flexible array of stripes.
 * map_lookup_size(n) gives the allocation size for n stripe entries.
 * NOTE(review): presumably one of these describes a single chunk mapping and
 * num_stripes is the length of stripes[] -- confirm against the users.
 */
struct map_lookup {
	u64 type;
	int io_align;
	int io_width;
	int stripe_len;
	int sector_size;
	int num_stripes;
	int sub_stripes;
	struct btrfs_bio_stripe stripes[];
};
44
45static int init_first_rw_device(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root,
47 struct btrfs_device *device);
48static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
49
/* bytes needed for a struct map_lookup followed by n stripe entries */
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
			    (sizeof(struct btrfs_bio_stripe) * (n)))
52
53static DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids);
55
/* take the file-wide uuid_mutex */
void btrfs_lock_volumes(void)
{
	mutex_lock(&uuid_mutex);
}
60
/* release the file-wide uuid_mutex taken by btrfs_lock_volumes() */
void btrfs_unlock_volumes(void)
{
	mutex_unlock(&uuid_mutex);
}
65
/* take the per-filesystem chunk_mutex reachable from @root */
static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}
70
/* release the per-filesystem chunk_mutex taken by lock_chunks() */
static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}
75
76static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
77{
78 struct btrfs_device *device;
79 WARN_ON(fs_devices->opened);
80 while (!list_empty(&fs_devices->devices)) {
81 device = list_entry(fs_devices->devices.next,
82 struct btrfs_device, dev_list);
83 list_del(&device->dev_list);
84 kfree(device->name);
85 kfree(device);
86 }
87 kfree(fs_devices);
88}
89
90int btrfs_cleanup_fs_uuids(void)
91{
92 struct btrfs_fs_devices *fs_devices;
93
94 while (!list_empty(&fs_uuids)) {
95 fs_devices = list_entry(fs_uuids.next,
96 struct btrfs_fs_devices, list);
97 list_del(&fs_devices->list);
98 free_fs_devices(fs_devices);
99 }
100 return 0;
101}
102
103static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid)
105{
106 struct btrfs_device *dev;
107 struct list_head *cur;
108
109 list_for_each(cur, head) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev;
114 }
115 }
116 return NULL;
117}
118
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices;
123
124 list_for_each(cur, &fs_uuids) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices;
128 }
129 return NULL;
130}
131
/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 *
 * Always returns 0.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run = 0;
	unsigned long limit;

	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
	fs_info = device->dev_root->fs_info;
	/* waiters are woken once the async bio count drops below 2/3 of the limit */
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	pending = device->pending_bios;
	tail = device->pending_bio_tail;
	WARN_ON(pending && !tail);
	device->pending_bios = NULL;
	device->pending_bio_tail = NULL;

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (pending) {
		again = 1;
		device->running_pending = 1;
	} else {
		again = 0;
		device->running_pending = 0;
	}
	spin_unlock(&device->io_lock);

	while (pending) {
		/* detach the head bio from the private chain before submitting it */
		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;
		atomic_dec(&fs_info->nr_async_bios);

		if (atomic_read(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
		/* hold an extra reference across submit_bio() */
		bio_get(cur);
		submit_bio(cur->bi_rw, cur);
		bio_put(cur);
		num_run++;

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) &&
		    fs_info->fs_devices->open_devices > 1) {
			struct bio *old_head;

			spin_lock(&device->io_lock);

			/*
			 * put the unprocessed remainder back at the front of
			 * the device list, ahead of anything queued meanwhile
			 */
			old_head = device->pending_bios;
			device->pending_bios = pending;
			if (device->pending_bio_tail)
				tail->bi_next = old_head;
			else
				device->pending_bio_tail = tail;
			device->running_pending = 0;

			spin_unlock(&device->io_lock);
			btrfs_requeue_work(&device->work);
			goto done;
		}
	}
	if (again)
		goto loop;
done:
	return 0;
}
235
236static void pending_bios_fn(struct btrfs_work *work)
237{
238 struct btrfs_device *device;
239
240 device = container_of(work, struct btrfs_device, work);
241 run_scheduled_bios(device);
242}
243
244static noinline int device_list_add(const char *path,
245 struct btrfs_super_block *disk_super,
246 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
247{
248 struct btrfs_device *device;
249 struct btrfs_fs_devices *fs_devices;
250 u64 found_transid = btrfs_super_generation(disk_super);
251
252 fs_devices = find_fsid(disk_super->fsid);
253 if (!fs_devices) {
254 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
255 if (!fs_devices)
256 return -ENOMEM;
257 INIT_LIST_HEAD(&fs_devices->devices);
258 INIT_LIST_HEAD(&fs_devices->alloc_list);
259 list_add(&fs_devices->list, &fs_uuids);
260 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
261 fs_devices->latest_devid = devid;
262 fs_devices->latest_trans = found_transid;
263 device = NULL;
264 } else {
265 device = __find_device(&fs_devices->devices, devid,
266 disk_super->dev_item.uuid);
267 }
268 if (!device) {
269 if (fs_devices->opened)
270 return -EBUSY;
271
272 device = kzalloc(sizeof(*device), GFP_NOFS);
273 if (!device) {
274 /* we can safely leave the fs_devices entry around */
275 return -ENOMEM;
276 }
277 device->devid = devid;
278 device->work.func = pending_bios_fn;
279 memcpy(device->uuid, disk_super->dev_item.uuid,
280 BTRFS_UUID_SIZE);
281 device->barriers = 1;
282 spin_lock_init(&device->io_lock);
283 device->name = kstrdup(path, GFP_NOFS);
284 if (!device->name) {
285 kfree(device);
286 return -ENOMEM;
287 }
288 INIT_LIST_HEAD(&device->dev_alloc_list);
289 list_add(&device->dev_list, &fs_devices->devices);
290 device->fs_devices = fs_devices;
291 fs_devices->num_devices++;
292 }
293
294 if (found_transid > fs_devices->latest_trans) {
295 fs_devices->latest_devid = devid;
296 fs_devices->latest_trans = found_transid;
297 }
298 *fs_devices_ret = fs_devices;
299 return 0;
300}
301
302static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
303{
304 struct btrfs_fs_devices *fs_devices;
305 struct btrfs_device *device;
306 struct btrfs_device *orig_dev;
307
308 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
309 if (!fs_devices)
310 return ERR_PTR(-ENOMEM);
311
312 INIT_LIST_HEAD(&fs_devices->devices);
313 INIT_LIST_HEAD(&fs_devices->alloc_list);
314 INIT_LIST_HEAD(&fs_devices->list);
315 fs_devices->latest_devid = orig->latest_devid;
316 fs_devices->latest_trans = orig->latest_trans;
317 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
318
319 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
320 device = kzalloc(sizeof(*device), GFP_NOFS);
321 if (!device)
322 goto error;
323
324 device->name = kstrdup(orig_dev->name, GFP_NOFS);
325 if (!device->name)
326 goto error;
327
328 device->devid = orig_dev->devid;
329 device->work.func = pending_bios_fn;
330 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
331 device->barriers = 1;
332 spin_lock_init(&device->io_lock);
333 INIT_LIST_HEAD(&device->dev_list);
334 INIT_LIST_HEAD(&device->dev_alloc_list);
335
336 list_add(&device->dev_list, &fs_devices->devices);
337 device->fs_devices = fs_devices;
338 fs_devices->num_devices++;
339 }
340 return fs_devices;
341error:
342 free_fs_devices(fs_devices);
343 return ERR_PTR(-ENOMEM);
344}
345
346int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
347{
348 struct list_head *tmp;
349 struct list_head *cur;
350 struct btrfs_device *device;
351
352 mutex_lock(&uuid_mutex);
353again:
354 list_for_each_safe(cur, tmp, &fs_devices->devices) {
355 device = list_entry(cur, struct btrfs_device, dev_list);
356 if (device->in_fs_metadata)
357 continue;
358
359 if (device->bdev) {
360 close_bdev_exclusive(device->bdev, device->mode);
361 device->bdev = NULL;
362 fs_devices->open_devices--;
363 }
364 if (device->writeable) {
365 list_del_init(&device->dev_alloc_list);
366 device->writeable = 0;
367 fs_devices->rw_devices--;
368 }
369 list_del_init(&device->dev_list);
370 fs_devices->num_devices--;
371 kfree(device->name);
372 kfree(device);
373 }
374
375 if (fs_devices->seed) {
376 fs_devices = fs_devices->seed;
377 goto again;
378 }
379
380 mutex_unlock(&uuid_mutex);
381 return 0;
382}
383
384static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
385{
386 struct list_head *cur;
387 struct btrfs_device *device;
388
389 if (--fs_devices->opened > 0)
390 return 0;
391
392 list_for_each(cur, &fs_devices->devices) {
393 device = list_entry(cur, struct btrfs_device, dev_list);
394 if (device->bdev) {
395 close_bdev_exclusive(device->bdev, device->mode);
396 fs_devices->open_devices--;
397 }
398 if (device->writeable) {
399 list_del_init(&device->dev_alloc_list);
400 fs_devices->rw_devices--;
401 }
402
403 device->bdev = NULL;
404 device->writeable = 0;
405 device->in_fs_metadata = 0;
406 }
407 WARN_ON(fs_devices->open_devices);
408 WARN_ON(fs_devices->rw_devices);
409 fs_devices->opened = 0;
410 fs_devices->seeding = 0;
411
412 return 0;
413}
414
415int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
416{
417 struct btrfs_fs_devices *seed_devices = NULL;
418 int ret;
419
420 mutex_lock(&uuid_mutex);
421 ret = __btrfs_close_devices(fs_devices);
422 if (!fs_devices->opened) {
423 seed_devices = fs_devices->seed;
424 fs_devices->seed = NULL;
425 }
426 mutex_unlock(&uuid_mutex);
427
428 while (seed_devices) {
429 fs_devices = seed_devices;
430 seed_devices = fs_devices->seed;
431 __btrfs_close_devices(fs_devices);
432 free_fs_devices(fs_devices);
433 }
434 return ret;
435}
436
437static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
438 fmode_t flags, void *holder)
439{
440 struct block_device *bdev;
441 struct list_head *head = &fs_devices->devices;
442 struct list_head *cur;
443 struct btrfs_device *device;
444 struct block_device *latest_bdev = NULL;
445 struct buffer_head *bh;
446 struct btrfs_super_block *disk_super;
447 u64 latest_devid = 0;
448 u64 latest_transid = 0;
449 u64 devid;
450 int seeding = 1;
451 int ret = 0;
452
453 list_for_each(cur, head) {
454 device = list_entry(cur, struct btrfs_device, dev_list);
455 if (device->bdev)
456 continue;
457 if (!device->name)
458 continue;
459
460 bdev = open_bdev_exclusive(device->name, flags, holder);
461 if (IS_ERR(bdev)) {
462 printk(KERN_INFO "open %s failed\n", device->name);
463 goto error;
464 }
465 set_blocksize(bdev, 4096);
466
467 bh = btrfs_read_dev_super(bdev);
468 if (!bh)
469 goto error_close;
470
471 disk_super = (struct btrfs_super_block *)bh->b_data;
472 devid = le64_to_cpu(disk_super->dev_item.devid);
473 if (devid != device->devid)
474 goto error_brelse;
475
476 if (memcmp(device->uuid, disk_super->dev_item.uuid,
477 BTRFS_UUID_SIZE))
478 goto error_brelse;
479
480 device->generation = btrfs_super_generation(disk_super);
481 if (!latest_transid || device->generation > latest_transid) {
482 latest_devid = devid;
483 latest_transid = device->generation;
484 latest_bdev = bdev;
485 }
486
487 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
488 device->writeable = 0;
489 } else {
490 device->writeable = !bdev_read_only(bdev);
491 seeding = 0;
492 }
493
494 device->bdev = bdev;
495 device->in_fs_metadata = 0;
496 device->mode = flags;
497
498 fs_devices->open_devices++;
499 if (device->writeable) {
500 fs_devices->rw_devices++;
501 list_add(&device->dev_alloc_list,
502 &fs_devices->alloc_list);
503 }
504 continue;
505
506error_brelse:
507 brelse(bh);
508error_close:
509 close_bdev_exclusive(bdev, FMODE_READ);
510error:
511 continue;
512 }
513 if (fs_devices->open_devices == 0) {
514 ret = -EIO;
515 goto out;
516 }
517 fs_devices->seeding = seeding;
518 fs_devices->opened = 1;
519 fs_devices->latest_bdev = latest_bdev;
520 fs_devices->latest_devid = latest_devid;
521 fs_devices->latest_trans = latest_transid;
522 fs_devices->total_rw_bytes = 0;
523out:
524 return ret;
525}
526
527int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
528 fmode_t flags, void *holder)
529{
530 int ret;
531
532 mutex_lock(&uuid_mutex);
533 if (fs_devices->opened) {
534 fs_devices->opened++;
535 ret = 0;
536 } else {
537 ret = __btrfs_open_devices(fs_devices, flags, holder);
538 }
539 mutex_unlock(&uuid_mutex);
540 return ret;
541}
542
543int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
544 struct btrfs_fs_devices **fs_devices_ret)
545{
546 struct btrfs_super_block *disk_super;
547 struct block_device *bdev;
548 struct buffer_head *bh;
549 int ret;
550 u64 devid;
551 u64 transid;
552
553 mutex_lock(&uuid_mutex);
554
555 bdev = open_bdev_exclusive(path, flags, holder);
556
557 if (IS_ERR(bdev)) {
558 ret = PTR_ERR(bdev);
559 goto error;
560 }
561
562 ret = set_blocksize(bdev, 4096);
563 if (ret)
564 goto error_close;
565 bh = btrfs_read_dev_super(bdev);
566 if (!bh) {
567 ret = -EIO;
568 goto error_close;
569 }
570 disk_super = (struct btrfs_super_block *)bh->b_data;
571 devid = le64_to_cpu(disk_super->dev_item.devid);
572 transid = btrfs_super_generation(disk_super);
573 if (disk_super->label[0])
574 printk(KERN_INFO "device label %s ", disk_super->label);
575 else {
576 /* FIXME, make a readl uuid parser */
577 printk(KERN_INFO "device fsid %llx-%llx ",
578 *(unsigned long long *)disk_super->fsid,
579 *(unsigned long long *)(disk_super->fsid + 8));
580 }
581 printk(KERN_INFO "devid %llu transid %llu %s\n",
582 (unsigned long long)devid, (unsigned long long)transid, path);
583 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
584
585 brelse(bh);
586error_close:
587 close_bdev_exclusive(bdev, flags);
588error:
589 mutex_unlock(&uuid_mutex);
590 return ret;
591}
592
/*
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * Scans the dev extent items of @device for a gap of at least @num_bytes
 * and stores its start offset in @start.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOSPC if
 * no suitable gap ends before device->total_bytes, or a negative error
 * from the tree search.
 */
static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
					 struct btrfs_device *device,
					 u64 num_bytes, u64 *start)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 hole_size = 0;
	u64 last_byte = 0;
	u64 search_start = 0;
	u64 search_end = device->total_bytes;
	int ret;
	int slot = 0;
	int start_found;
	struct extent_buffer *l;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;
	start_found = 0;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max((u64)1024 * 1024, search_start);

	/* honour fs_info->alloc_start only if the request still fits behind it */
	if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
		search_start = max(root->fs_info->alloc_start, search_start);

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	/* step back one item so an extent starting before search_start is seen */
	ret = btrfs_previous_item(root, path, 0, key.type);
	if (ret < 0)
		goto error;
	l = path->nodes[0];
	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
no_more_items:
			/* reached from here and from the loop below once past this devid */
			if (!start_found) {
				if (search_start >= search_end) {
					ret = -ENOSPC;
					goto error;
				}
				*start = search_start;
				start_found = 1;
				goto check_pending;
			}
			/* candidate is the space after the last extent seen */
			*start = last_byte > search_start ?
				last_byte : search_start;
			if (search_end <= *start) {
				ret = -ENOSPC;
				goto error;
			}
			goto check_pending;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			goto no_more_items;

		/* is the gap between the previous extent and this item big enough? */
		if (key.offset >= search_start && key.offset > last_byte &&
		    start_found) {
			if (last_byte < search_start)
				last_byte = search_start;
			hole_size = key.offset - last_byte;
			if (key.offset > last_byte &&
			    hole_size >= num_bytes) {
				*start = last_byte;
				goto check_pending;
			}
		}
		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		/* remember where this extent ends */
		start_found = 1;
		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
next:
		path->slots[0]++;
		cond_resched();
	}
check_pending:
	/* we have to make sure we didn't find an extent that has already
	 * been allocated by the map tree or the original allocation
	 */
	BUG_ON(*start < search_start);

	if (*start + num_bytes > search_end) {
		ret = -ENOSPC;
		goto error;
	}
	/* check for pending inserts here */
	ret = 0;

error:
	/* shared exit: also reached on success with ret == 0 */
	btrfs_free_path(path);
	return ret;
}
715
716static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
717 struct btrfs_device *device,
718 u64 start)
719{
720 int ret;
721 struct btrfs_path *path;
722 struct btrfs_root *root = device->dev_root;
723 struct btrfs_key key;
724 struct btrfs_key found_key;
725 struct extent_buffer *leaf = NULL;
726 struct btrfs_dev_extent *extent = NULL;
727
728 path = btrfs_alloc_path();
729 if (!path)
730 return -ENOMEM;
731
732 key.objectid = device->devid;
733 key.offset = start;
734 key.type = BTRFS_DEV_EXTENT_KEY;
735
736 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
737 if (ret > 0) {
738 ret = btrfs_previous_item(root, path, key.objectid,
739 BTRFS_DEV_EXTENT_KEY);
740 BUG_ON(ret);
741 leaf = path->nodes[0];
742 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
743 extent = btrfs_item_ptr(leaf, path->slots[0],
744 struct btrfs_dev_extent);
745 BUG_ON(found_key.offset > start || found_key.offset +
746 btrfs_dev_extent_length(leaf, extent) < start);
747 ret = 0;
748 } else if (ret == 0) {
749 leaf = path->nodes[0];
750 extent = btrfs_item_ptr(leaf, path->slots[0],
751 struct btrfs_dev_extent);
752 }
753 BUG_ON(ret);
754
755 if (device->bytes_used > 0)
756 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
757 ret = btrfs_del_item(trans, root, path);
758 BUG_ON(ret);
759
760 btrfs_free_path(path);
761 return ret;
762}
763
764int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
765 struct btrfs_device *device,
766 u64 chunk_tree, u64 chunk_objectid,
767 u64 chunk_offset, u64 start, u64 num_bytes)
768{
769 int ret;
770 struct btrfs_path *path;
771 struct btrfs_root *root = device->dev_root;
772 struct btrfs_dev_extent *extent;
773 struct extent_buffer *leaf;
774 struct btrfs_key key;
775
776 WARN_ON(!device->in_fs_metadata);
777 path = btrfs_alloc_path();
778 if (!path)
779 return -ENOMEM;
780
781 key.objectid = device->devid;
782 key.offset = start;
783 key.type = BTRFS_DEV_EXTENT_KEY;
784 ret = btrfs_insert_empty_item(trans, root, path, &key,
785 sizeof(*extent));
786 BUG_ON(ret);
787
788 leaf = path->nodes[0];
789 extent = btrfs_item_ptr(leaf, path->slots[0],
790 struct btrfs_dev_extent);
791 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
792 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
793 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
794
795 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
796 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
797 BTRFS_UUID_SIZE);
798
799 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
800 btrfs_mark_buffer_dirty(leaf);
801 btrfs_free_path(path);
802 return ret;
803}
804
805static noinline int find_next_chunk(struct btrfs_root *root,
806 u64 objectid, u64 *offset)
807{
808 struct btrfs_path *path;
809 int ret;
810 struct btrfs_key key;
811 struct btrfs_chunk *chunk;
812 struct btrfs_key found_key;
813
814 path = btrfs_alloc_path();
815 BUG_ON(!path);
816
817 key.objectid = objectid;
818 key.offset = (u64)-1;
819 key.type = BTRFS_CHUNK_ITEM_KEY;
820
821 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
822 if (ret < 0)
823 goto error;
824
825 BUG_ON(ret == 0);
826
827 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
828 if (ret) {
829 *offset = 0;
830 } else {
831 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
832 path->slots[0]);
833 if (found_key.objectid != objectid)
834 *offset = 0;
835 else {
836 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
837 struct btrfs_chunk);
838 *offset = found_key.offset +
839 btrfs_chunk_length(path->nodes[0], chunk);
840 }
841 }
842 ret = 0;
843error:
844 btrfs_free_path(path);
845 return ret;
846}
847
848static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
849{
850 int ret;
851 struct btrfs_key key;
852 struct btrfs_key found_key;
853 struct btrfs_path *path;
854
855 root = root->fs_info->chunk_root;
856
857 path = btrfs_alloc_path();
858 if (!path)
859 return -ENOMEM;
860
861 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
862 key.type = BTRFS_DEV_ITEM_KEY;
863 key.offset = (u64)-1;
864
865 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
866 if (ret < 0)
867 goto error;
868
869 BUG_ON(ret == 0);
870
871 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
872 BTRFS_DEV_ITEM_KEY);
873 if (ret) {
874 *objectid = 1;
875 } else {
876 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
877 path->slots[0]);
878 *objectid = found_key.offset + 1;
879 }
880 ret = 0;
881error:
882 btrfs_free_path(path);
883 return ret;
884}
885
886/*
887 * the device information is stored in the chunk root
888 * the btrfs_device struct should be fully filled in
889 */
890int btrfs_add_device(struct btrfs_trans_handle *trans,
891 struct btrfs_root *root,
892 struct btrfs_device *device)
893{
894 int ret;
895 struct btrfs_path *path;
896 struct btrfs_dev_item *dev_item;
897 struct extent_buffer *leaf;
898 struct btrfs_key key;
899 unsigned long ptr;
900
901 root = root->fs_info->chunk_root;
902
903 path = btrfs_alloc_path();
904 if (!path)
905 return -ENOMEM;
906
907 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
908 key.type = BTRFS_DEV_ITEM_KEY;
909 key.offset = device->devid;
910
911 ret = btrfs_insert_empty_item(trans, root, path, &key,
912 sizeof(*dev_item));
913 if (ret)
914 goto out;
915
916 leaf = path->nodes[0];
917 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
918
919 btrfs_set_device_id(leaf, dev_item, device->devid);
920 btrfs_set_device_generation(leaf, dev_item, 0);
921 btrfs_set_device_type(leaf, dev_item, device->type);
922 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
923 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
924 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
925 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
926 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
927 btrfs_set_device_group(leaf, dev_item, 0);
928 btrfs_set_device_seek_speed(leaf, dev_item, 0);
929 btrfs_set_device_bandwidth(leaf, dev_item, 0);
930 btrfs_set_device_start_offset(leaf, dev_item, 0);
931
932 ptr = (unsigned long)btrfs_device_uuid(dev_item);
933 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
934 ptr = (unsigned long)btrfs_device_fsid(dev_item);
935 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
936 btrfs_mark_buffer_dirty(leaf);
937
938 ret = 0;
939out:
940 btrfs_free_path(path);
941 return ret;
942}
943
944static int btrfs_rm_dev_item(struct btrfs_root *root,
945 struct btrfs_device *device)
946{
947 int ret;
948 struct btrfs_path *path;
949 struct btrfs_key key;
950 struct btrfs_trans_handle *trans;
951
952 root = root->fs_info->chunk_root;
953
954 path = btrfs_alloc_path();
955 if (!path)
956 return -ENOMEM;
957
958 trans = btrfs_start_transaction(root, 1);
959 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
960 key.type = BTRFS_DEV_ITEM_KEY;
961 key.offset = device->devid;
962 lock_chunks(root);
963
964 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
965 if (ret < 0)
966 goto out;
967
968 if (ret > 0) {
969 ret = -ENOENT;
970 goto out;
971 }
972
973 ret = btrfs_del_item(trans, root, path);
974 if (ret)
975 goto out;
976out:
977 btrfs_free_path(path);
978 unlock_chunks(root);
979 btrfs_commit_transaction(trans, root);
980 return ret;
981}
982
983int btrfs_rm_device(struct btrfs_root *root, char *device_path)
984{
985 struct btrfs_device *device;
986 struct btrfs_device *next_device;
987 struct block_device *bdev;
988 struct buffer_head *bh = NULL;
989 struct btrfs_super_block *disk_super;
990 u64 all_avail;
991 u64 devid;
992 u64 num_devices;
993 u8 *dev_uuid;
994 int ret = 0;
995
996 mutex_lock(&uuid_mutex);
997 mutex_lock(&root->fs_info->volume_mutex);
998
999 all_avail = root->fs_info->avail_data_alloc_bits |
1000 root->fs_info->avail_system_alloc_bits |
1001 root->fs_info->avail_metadata_alloc_bits;
1002
1003 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1004 root->fs_info->fs_devices->rw_devices <= 4) {
1005 printk(KERN_ERR "btrfs: unable to go below four devices "
1006 "on raid10\n");
1007 ret = -EINVAL;
1008 goto out;
1009 }
1010
1011 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1012 root->fs_info->fs_devices->rw_devices <= 2) {
1013 printk(KERN_ERR "btrfs: unable to go below two "
1014 "devices on raid1\n");
1015 ret = -EINVAL;
1016 goto out;
1017 }
1018
1019 if (strcmp(device_path, "missing") == 0) {
1020 struct list_head *cur;
1021 struct list_head *devices;
1022 struct btrfs_device *tmp;
1023
1024 device = NULL;
1025 devices = &root->fs_info->fs_devices->devices;
1026 list_for_each(cur, devices) {
1027 tmp = list_entry(cur, struct btrfs_device, dev_list);
1028 if (tmp->in_fs_metadata && !tmp->bdev) {
1029 device = tmp;
1030 break;
1031 }
1032 }
1033 bdev = NULL;
1034 bh = NULL;
1035 disk_super = NULL;
1036 if (!device) {
1037 printk(KERN_ERR "btrfs: no missing devices found to "
1038 "remove\n");
1039 goto out;
1040 }
1041 } else {
1042 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1043 root->fs_info->bdev_holder);
1044 if (IS_ERR(bdev)) {
1045 ret = PTR_ERR(bdev);
1046 goto out;
1047 }
1048
1049 set_blocksize(bdev, 4096);
1050 bh = btrfs_read_dev_super(bdev);
1051 if (!bh) {
1052 ret = -EIO;
1053 goto error_close;
1054 }
1055 disk_super = (struct btrfs_super_block *)bh->b_data;
1056 devid = le64_to_cpu(disk_super->dev_item.devid);
1057 dev_uuid = disk_super->dev_item.uuid;
1058 device = btrfs_find_device(root, devid, dev_uuid,
1059 disk_super->fsid);
1060 if (!device) {
1061 ret = -ENOENT;
1062 goto error_brelse;
1063 }
1064 }
1065
1066 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1067 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1068 "device\n");
1069 ret = -EINVAL;
1070 goto error_brelse;
1071 }
1072
1073 if (device->writeable) {
1074 list_del_init(&device->dev_alloc_list);
1075 root->fs_info->fs_devices->rw_devices--;
1076 }
1077
1078 ret = btrfs_shrink_device(device, 0);
1079 if (ret)
1080 goto error_brelse;
1081
1082 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1083 if (ret)
1084 goto error_brelse;
1085
1086 device->in_fs_metadata = 0;
1087 list_del_init(&device->dev_list);
1088 device->fs_devices->num_devices--;
1089
1090 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1091 struct btrfs_device, dev_list);
1092 if (device->bdev == root->fs_info->sb->s_bdev)
1093 root->fs_info->sb->s_bdev = next_device->bdev;
1094 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1095 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1096
1097 if (device->bdev) {
1098 close_bdev_exclusive(device->bdev, device->mode);
1099 device->bdev = NULL;
1100 device->fs_devices->open_devices--;
1101 }
1102
1103 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1104 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1105
1106 if (device->fs_devices->open_devices == 0) {
1107 struct btrfs_fs_devices *fs_devices;
1108 fs_devices = root->fs_info->fs_devices;
1109 while (fs_devices) {
1110 if (fs_devices->seed == device->fs_devices)
1111 break;
1112 fs_devices = fs_devices->seed;
1113 }
1114 fs_devices->seed = device->fs_devices->seed;
1115 device->fs_devices->seed = NULL;
1116 __btrfs_close_devices(device->fs_devices);
1117 free_fs_devices(device->fs_devices);
1118 }
1119
1120 /*
1121 * at this point, the device is zero sized. We want to
1122 * remove it from the devices list and zero out the old super
1123 */
1124 if (device->writeable) {
1125 /* make sure this device isn't detected as part of
1126 * the FS anymore
1127 */
1128 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1129 set_buffer_dirty(bh);
1130 sync_dirty_buffer(bh);
1131 }
1132
1133 kfree(device->name);
1134 kfree(device);
1135 ret = 0;
1136
1137error_brelse:
1138 brelse(bh);
1139error_close:
1140 if (bdev)
1141 close_bdev_exclusive(bdev, FMODE_READ);
1142out:
1143 mutex_unlock(&root->fs_info->volume_mutex);
1144 mutex_unlock(&uuid_mutex);
1145 return ret;
1146}
1147
1148/*
1149 * does all the dirty work required for changing file system's UUID.
1150 */
1151static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1152 struct btrfs_root *root)
1153{
1154 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1155 struct btrfs_fs_devices *old_devices;
1156 struct btrfs_fs_devices *seed_devices;
1157 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1158 struct btrfs_device *device;
1159 u64 super_flags;
1160
1161 BUG_ON(!mutex_is_locked(&uuid_mutex));
1162 if (!fs_devices->seeding)
1163 return -EINVAL;
1164
1165 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1166 if (!seed_devices)
1167 return -ENOMEM;
1168
1169 old_devices = clone_fs_devices(fs_devices);
1170 if (IS_ERR(old_devices)) {
1171 kfree(seed_devices);
1172 return PTR_ERR(old_devices);
1173 }
1174
1175 list_add(&old_devices->list, &fs_uuids);
1176
1177 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1178 seed_devices->opened = 1;
1179 INIT_LIST_HEAD(&seed_devices->devices);
1180 INIT_LIST_HEAD(&seed_devices->alloc_list);
1181 list_splice_init(&fs_devices->devices, &seed_devices->devices);
1182 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1183 list_for_each_entry(device, &seed_devices->devices, dev_list) {
1184 device->fs_devices = seed_devices;
1185 }
1186
1187 fs_devices->seeding = 0;
1188 fs_devices->num_devices = 0;
1189 fs_devices->open_devices = 0;
1190 fs_devices->seed = seed_devices;
1191
1192 generate_random_uuid(fs_devices->fsid);
1193 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1194 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1195 super_flags = btrfs_super_flags(disk_super) &
1196 ~BTRFS_SUPER_FLAG_SEEDING;
1197 btrfs_set_super_flags(disk_super, super_flags);
1198
1199 return 0;
1200}
1201
1202/*
1203 * strore the expected generation for seed devices in device items.
1204 */
1205static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1206 struct btrfs_root *root)
1207{
1208 struct btrfs_path *path;
1209 struct extent_buffer *leaf;
1210 struct btrfs_dev_item *dev_item;
1211 struct btrfs_device *device;
1212 struct btrfs_key key;
1213 u8 fs_uuid[BTRFS_UUID_SIZE];
1214 u8 dev_uuid[BTRFS_UUID_SIZE];
1215 u64 devid;
1216 int ret;
1217
1218 path = btrfs_alloc_path();
1219 if (!path)
1220 return -ENOMEM;
1221
1222 root = root->fs_info->chunk_root;
1223 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1224 key.offset = 0;
1225 key.type = BTRFS_DEV_ITEM_KEY;
1226
1227 while (1) {
1228 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1229 if (ret < 0)
1230 goto error;
1231
1232 leaf = path->nodes[0];
1233next_slot:
1234 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1235 ret = btrfs_next_leaf(root, path);
1236 if (ret > 0)
1237 break;
1238 if (ret < 0)
1239 goto error;
1240 leaf = path->nodes[0];
1241 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1242 btrfs_release_path(root, path);
1243 continue;
1244 }
1245
1246 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1247 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1248 key.type != BTRFS_DEV_ITEM_KEY)
1249 break;
1250
1251 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1252 struct btrfs_dev_item);
1253 devid = btrfs_device_id(leaf, dev_item);
1254 read_extent_buffer(leaf, dev_uuid,
1255 (unsigned long)btrfs_device_uuid(dev_item),
1256 BTRFS_UUID_SIZE);
1257 read_extent_buffer(leaf, fs_uuid,
1258 (unsigned long)btrfs_device_fsid(dev_item),
1259 BTRFS_UUID_SIZE);
1260 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1261 BUG_ON(!device);
1262
1263 if (device->fs_devices->seeding) {
1264 btrfs_set_device_generation(leaf, dev_item,
1265 device->generation);
1266 btrfs_mark_buffer_dirty(leaf);
1267 }
1268
1269 path->slots[0]++;
1270 goto next_slot;
1271 }
1272 ret = 0;
1273error:
1274 btrfs_free_path(path);
1275 return ret;
1276}
1277
1278int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1279{
1280 struct btrfs_trans_handle *trans;
1281 struct btrfs_device *device;
1282 struct block_device *bdev;
1283 struct list_head *cur;
1284 struct list_head *devices;
1285 struct super_block *sb = root->fs_info->sb;
1286 u64 total_bytes;
1287 int seeding_dev = 0;
1288 int ret = 0;
1289
1290 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1291 return -EINVAL;
1292
1293 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1294 if (!bdev)
1295 return -EIO;
1296
1297 if (root->fs_info->fs_devices->seeding) {
1298 seeding_dev = 1;
1299 down_write(&sb->s_umount);
1300 mutex_lock(&uuid_mutex);
1301 }
1302
1303 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1304 mutex_lock(&root->fs_info->volume_mutex);
1305
1306 devices = &root->fs_info->fs_devices->devices;
1307 list_for_each(cur, devices) {
1308 device = list_entry(cur, struct btrfs_device, dev_list);
1309 if (device->bdev == bdev) {
1310 ret = -EEXIST;
1311 goto error;
1312 }
1313 }
1314
1315 device = kzalloc(sizeof(*device), GFP_NOFS);
1316 if (!device) {
1317 /* we can safely leave the fs_devices entry around */
1318 ret = -ENOMEM;
1319 goto error;
1320 }
1321
1322 device->name = kstrdup(device_path, GFP_NOFS);
1323 if (!device->name) {
1324 kfree(device);
1325 ret = -ENOMEM;
1326 goto error;
1327 }
1328
1329 ret = find_next_devid(root, &device->devid);
1330 if (ret) {
1331 kfree(device);
1332 goto error;
1333 }
1334
1335 trans = btrfs_start_transaction(root, 1);
1336 lock_chunks(root);
1337
1338 device->barriers = 1;
1339 device->writeable = 1;
1340 device->work.func = pending_bios_fn;
1341 generate_random_uuid(device->uuid);
1342 spin_lock_init(&device->io_lock);
1343 device->generation = trans->transid;
1344 device->io_width = root->sectorsize;
1345 device->io_align = root->sectorsize;
1346 device->sector_size = root->sectorsize;
1347 device->total_bytes = i_size_read(bdev->bd_inode);
1348 device->dev_root = root->fs_info->dev_root;
1349 device->bdev = bdev;
1350 device->in_fs_metadata = 1;
1351 device->mode = 0;
1352 set_blocksize(device->bdev, 4096);
1353
1354 if (seeding_dev) {
1355 sb->s_flags &= ~MS_RDONLY;
1356 ret = btrfs_prepare_sprout(trans, root);
1357 BUG_ON(ret);
1358 }
1359
1360 device->fs_devices = root->fs_info->fs_devices;
1361 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1362 list_add(&device->dev_alloc_list,
1363 &root->fs_info->fs_devices->alloc_list);
1364 root->fs_info->fs_devices->num_devices++;
1365 root->fs_info->fs_devices->open_devices++;
1366 root->fs_info->fs_devices->rw_devices++;
1367 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1368
1369 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1370 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1371 total_bytes + device->total_bytes);
1372
1373 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1374 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1375 total_bytes + 1);
1376
1377 if (seeding_dev) {
1378 ret = init_first_rw_device(trans, root, device);
1379 BUG_ON(ret);
1380 ret = btrfs_finish_sprout(trans, root);
1381 BUG_ON(ret);
1382 } else {
1383 ret = btrfs_add_device(trans, root, device);
1384 }
1385
1386 unlock_chunks(root);
1387 btrfs_commit_transaction(trans, root);
1388
1389 if (seeding_dev) {
1390 mutex_unlock(&uuid_mutex);
1391 up_write(&sb->s_umount);
1392
1393 ret = btrfs_relocate_sys_chunks(root);
1394 BUG_ON(ret);
1395 }
1396out:
1397 mutex_unlock(&root->fs_info->volume_mutex);
1398 return ret;
1399error:
1400 close_bdev_exclusive(bdev, 0);
1401 if (seeding_dev) {
1402 mutex_unlock(&uuid_mutex);
1403 up_write(&sb->s_umount);
1404 }
1405 goto out;
1406}
1407
1408static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1409 struct btrfs_device *device)
1410{
1411 int ret;
1412 struct btrfs_path *path;
1413 struct btrfs_root *root;
1414 struct btrfs_dev_item *dev_item;
1415 struct extent_buffer *leaf;
1416 struct btrfs_key key;
1417
1418 root = device->dev_root->fs_info->chunk_root;
1419
1420 path = btrfs_alloc_path();
1421 if (!path)
1422 return -ENOMEM;
1423
1424 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1425 key.type = BTRFS_DEV_ITEM_KEY;
1426 key.offset = device->devid;
1427
1428 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1429 if (ret < 0)
1430 goto out;
1431
1432 if (ret > 0) {
1433 ret = -ENOENT;
1434 goto out;
1435 }
1436
1437 leaf = path->nodes[0];
1438 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1439
1440 btrfs_set_device_id(leaf, dev_item, device->devid);
1441 btrfs_set_device_type(leaf, dev_item, device->type);
1442 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1443 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1444 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1445 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1446 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1447 btrfs_mark_buffer_dirty(leaf);
1448
1449out:
1450 btrfs_free_path(path);
1451 return ret;
1452}
1453
1454static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1455 struct btrfs_device *device, u64 new_size)
1456{
1457 struct btrfs_super_block *super_copy =
1458 &device->dev_root->fs_info->super_copy;
1459 u64 old_total = btrfs_super_total_bytes(super_copy);
1460 u64 diff = new_size - device->total_bytes;
1461
1462 if (!device->writeable)
1463 return -EACCES;
1464 if (new_size <= device->total_bytes)
1465 return -EINVAL;
1466
1467 btrfs_set_super_total_bytes(super_copy, old_total + diff);
1468 device->fs_devices->total_rw_bytes += diff;
1469
1470 device->total_bytes = new_size;
1471 return btrfs_update_device(trans, device);
1472}
1473
1474int btrfs_grow_device(struct btrfs_trans_handle *trans,
1475 struct btrfs_device *device, u64 new_size)
1476{
1477 int ret;
1478 lock_chunks(device->dev_root);
1479 ret = __btrfs_grow_device(trans, device, new_size);
1480 unlock_chunks(device->dev_root);
1481 return ret;
1482}
1483
1484static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1485 struct btrfs_root *root,
1486 u64 chunk_tree, u64 chunk_objectid,
1487 u64 chunk_offset)
1488{
1489 int ret;
1490 struct btrfs_path *path;
1491 struct btrfs_key key;
1492
1493 root = root->fs_info->chunk_root;
1494 path = btrfs_alloc_path();
1495 if (!path)
1496 return -ENOMEM;
1497
1498 key.objectid = chunk_objectid;
1499 key.offset = chunk_offset;
1500 key.type = BTRFS_CHUNK_ITEM_KEY;
1501
1502 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1503 BUG_ON(ret);
1504
1505 ret = btrfs_del_item(trans, root, path);
1506 BUG_ON(ret);
1507
1508 btrfs_free_path(path);
1509 return 0;
1510}
1511
1512static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1513 chunk_offset)
1514{
1515 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1516 struct btrfs_disk_key *disk_key;
1517 struct btrfs_chunk *chunk;
1518 u8 *ptr;
1519 int ret = 0;
1520 u32 num_stripes;
1521 u32 array_size;
1522 u32 len = 0;
1523 u32 cur;
1524 struct btrfs_key key;
1525
1526 array_size = btrfs_super_sys_array_size(super_copy);
1527
1528 ptr = super_copy->sys_chunk_array;
1529 cur = 0;
1530
1531 while (cur < array_size) {
1532 disk_key = (struct btrfs_disk_key *)ptr;
1533 btrfs_disk_key_to_cpu(&key, disk_key);
1534
1535 len = sizeof(*disk_key);
1536
1537 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1538 chunk = (struct btrfs_chunk *)(ptr + len);
1539 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1540 len += btrfs_chunk_item_size(num_stripes);
1541 } else {
1542 ret = -EIO;
1543 break;
1544 }
1545 if (key.objectid == chunk_objectid &&
1546 key.offset == chunk_offset) {
1547 memmove(ptr, ptr + len, array_size - (cur + len));
1548 array_size -= len;
1549 btrfs_set_super_sys_array_size(super_copy, array_size);
1550 } else {
1551 ptr += len;
1552 cur += len;
1553 }
1554 }
1555 return ret;
1556}
1557
1558static int btrfs_relocate_chunk(struct btrfs_root *root,
1559 u64 chunk_tree, u64 chunk_objectid,
1560 u64 chunk_offset)
1561{
1562 struct extent_map_tree *em_tree;
1563 struct btrfs_root *extent_root;
1564 struct btrfs_trans_handle *trans;
1565 struct extent_map *em;
1566 struct map_lookup *map;
1567 int ret;
1568 int i;
1569
1570 printk(KERN_INFO "btrfs relocating chunk %llu\n",
1571 (unsigned long long)chunk_offset);
1572 root = root->fs_info->chunk_root;
1573 extent_root = root->fs_info->extent_root;
1574 em_tree = &root->fs_info->mapping_tree.map_tree;
1575
1576 /* step one, relocate all the extents inside this chunk */
1577 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1578 BUG_ON(ret);
1579
1580 trans = btrfs_start_transaction(root, 1);
1581 BUG_ON(!trans);
1582
1583 lock_chunks(root);
1584
1585 /*
1586 * step two, delete the device extents and the
1587 * chunk tree entries
1588 */
1589 spin_lock(&em_tree->lock);
1590 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1591 spin_unlock(&em_tree->lock);
1592
1593 BUG_ON(em->start > chunk_offset ||
1594 em->start + em->len < chunk_offset);
1595 map = (struct map_lookup *)em->bdev;
1596
1597 for (i = 0; i < map->num_stripes; i++) {
1598 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1599 map->stripes[i].physical);
1600 BUG_ON(ret);
1601
1602 if (map->stripes[i].dev) {
1603 ret = btrfs_update_device(trans, map->stripes[i].dev);
1604 BUG_ON(ret);
1605 }
1606 }
1607 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1608 chunk_offset);
1609
1610 BUG_ON(ret);
1611
1612 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1613 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1614 BUG_ON(ret);
1615 }
1616
1617 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1618 BUG_ON(ret);
1619
1620 spin_lock(&em_tree->lock);
1621 remove_extent_mapping(em_tree, em);
1622 spin_unlock(&em_tree->lock);
1623
1624 kfree(map);
1625 em->bdev = NULL;
1626
1627 /* once for the tree */
1628 free_extent_map(em);
1629 /* once for us */
1630 free_extent_map(em);
1631
1632 unlock_chunks(root);
1633 btrfs_end_transaction(trans, root);
1634 return 0;
1635}
1636
1637static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1638{
1639 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1640 struct btrfs_path *path;
1641 struct extent_buffer *leaf;
1642 struct btrfs_chunk *chunk;
1643 struct btrfs_key key;
1644 struct btrfs_key found_key;
1645 u64 chunk_tree = chunk_root->root_key.objectid;
1646 u64 chunk_type;
1647 int ret;
1648
1649 path = btrfs_alloc_path();
1650 if (!path)
1651 return -ENOMEM;
1652
1653 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1654 key.offset = (u64)-1;
1655 key.type = BTRFS_CHUNK_ITEM_KEY;
1656
1657 while (1) {
1658 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1659 if (ret < 0)
1660 goto error;
1661 BUG_ON(ret == 0);
1662
1663 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1664 key.type);
1665 if (ret < 0)
1666 goto error;
1667 if (ret > 0)
1668 break;
1669
1670 leaf = path->nodes[0];
1671 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1672
1673 chunk = btrfs_item_ptr(leaf, path->slots[0],
1674 struct btrfs_chunk);
1675 chunk_type = btrfs_chunk_type(leaf, chunk);
1676 btrfs_release_path(chunk_root, path);
1677
1678 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1679 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1680 found_key.objectid,
1681 found_key.offset);
1682 BUG_ON(ret);
1683 }
1684
1685 if (found_key.offset == 0)
1686 break;
1687 key.offset = found_key.offset - 1;
1688 }
1689 ret = 0;
1690error:
1691 btrfs_free_path(path);
1692 return ret;
1693}
1694
1695static u64 div_factor(u64 num, int factor)
1696{
1697 if (factor == 10)
1698 return num;
1699 num *= factor;
1700 do_div(num, 10);
1701 return num;
1702}
1703
1704int btrfs_balance(struct btrfs_root *dev_root)
1705{
1706 int ret;
1707 struct list_head *cur;
1708 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1709 struct btrfs_device *device;
1710 u64 old_size;
1711 u64 size_to_free;
1712 struct btrfs_path *path;
1713 struct btrfs_key key;
1714 struct btrfs_chunk *chunk;
1715 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1716 struct btrfs_trans_handle *trans;
1717 struct btrfs_key found_key;
1718
1719 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1720 return -EROFS;
1721
1722 mutex_lock(&dev_root->fs_info->volume_mutex);
1723 dev_root = dev_root->fs_info->dev_root;
1724
1725 /* step one make some room on all the devices */
1726 list_for_each(cur, devices) {
1727 device = list_entry(cur, struct btrfs_device, dev_list);
1728 old_size = device->total_bytes;
1729 size_to_free = div_factor(old_size, 1);
1730 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1731 if (!device->writeable ||
1732 device->total_bytes - device->bytes_used > size_to_free)
1733 continue;
1734
1735 ret = btrfs_shrink_device(device, old_size - size_to_free);
1736 BUG_ON(ret);
1737
1738 trans = btrfs_start_transaction(dev_root, 1);
1739 BUG_ON(!trans);
1740
1741 ret = btrfs_grow_device(trans, device, old_size);
1742 BUG_ON(ret);
1743
1744 btrfs_end_transaction(trans, dev_root);
1745 }
1746
1747 /* step two, relocate all the chunks */
1748 path = btrfs_alloc_path();
1749 BUG_ON(!path);
1750
1751 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1752 key.offset = (u64)-1;
1753 key.type = BTRFS_CHUNK_ITEM_KEY;
1754
1755 while (1) {
1756 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1757 if (ret < 0)
1758 goto error;
1759
1760 /*
1761 * this shouldn't happen, it means the last relocate
1762 * failed
1763 */
1764 if (ret == 0)
1765 break;
1766
1767 ret = btrfs_previous_item(chunk_root, path, 0,
1768 BTRFS_CHUNK_ITEM_KEY);
1769 if (ret)
1770 break;
1771
1772 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1773 path->slots[0]);
1774 if (found_key.objectid != key.objectid)
1775 break;
1776
1777 chunk = btrfs_item_ptr(path->nodes[0],
1778 path->slots[0],
1779 struct btrfs_chunk);
1780 key.offset = found_key.offset;
1781 /* chunk zero is special */
1782 if (key.offset == 0)
1783 break;
1784
1785 btrfs_release_path(chunk_root, path);
1786 ret = btrfs_relocate_chunk(chunk_root,
1787 chunk_root->root_key.objectid,
1788 found_key.objectid,
1789 found_key.offset);
1790 BUG_ON(ret);
1791 }
1792 ret = 0;
1793error:
1794 btrfs_free_path(path);
1795 mutex_unlock(&dev_root->fs_info->volume_mutex);
1796 return ret;
1797}
1798
1799/*
1800 * shrinking a device means finding all of the device extents past
1801 * the new size, and then following the back refs to the chunks.
1802 * The chunk relocation code actually frees the device extent
1803 */
1804int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1805{
1806 struct btrfs_trans_handle *trans;
1807 struct btrfs_root *root = device->dev_root;
1808 struct btrfs_dev_extent *dev_extent = NULL;
1809 struct btrfs_path *path;
1810 u64 length;
1811 u64 chunk_tree;
1812 u64 chunk_objectid;
1813 u64 chunk_offset;
1814 int ret;
1815 int slot;
1816 struct extent_buffer *l;
1817 struct btrfs_key key;
1818 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1819 u64 old_total = btrfs_super_total_bytes(super_copy);
1820 u64 diff = device->total_bytes - new_size;
1821
1822 if (new_size >= device->total_bytes)
1823 return -EINVAL;
1824
1825 path = btrfs_alloc_path();
1826 if (!path)
1827 return -ENOMEM;
1828
1829 trans = btrfs_start_transaction(root, 1);
1830 if (!trans) {
1831 ret = -ENOMEM;
1832 goto done;
1833 }
1834
1835 path->reada = 2;
1836
1837 lock_chunks(root);
1838
1839 device->total_bytes = new_size;
1840 if (device->writeable)
1841 device->fs_devices->total_rw_bytes -= diff;
1842 ret = btrfs_update_device(trans, device);
1843 if (ret) {
1844 unlock_chunks(root);
1845 btrfs_end_transaction(trans, root);
1846 goto done;
1847 }
1848 WARN_ON(diff > old_total);
1849 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1850 unlock_chunks(root);
1851 btrfs_end_transaction(trans, root);
1852
1853 key.objectid = device->devid;
1854 key.offset = (u64)-1;
1855 key.type = BTRFS_DEV_EXTENT_KEY;
1856
1857 while (1) {
1858 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1859 if (ret < 0)
1860 goto done;
1861
1862 ret = btrfs_previous_item(root, path, 0, key.type);
1863 if (ret < 0)
1864 goto done;
1865 if (ret) {
1866 ret = 0;
1867 goto done;
1868 }
1869
1870 l = path->nodes[0];
1871 slot = path->slots[0];
1872 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1873
1874 if (key.objectid != device->devid)
1875 goto done;
1876
1877 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1878 length = btrfs_dev_extent_length(l, dev_extent);
1879
1880 if (key.offset + length <= new_size)
1881 goto done;
1882
1883 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1884 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1885 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1886 btrfs_release_path(root, path);
1887
1888 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1889 chunk_offset);
1890 if (ret)
1891 goto done;
1892 }
1893
1894done:
1895 btrfs_free_path(path);
1896 return ret;
1897}
1898
1899static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1900 struct btrfs_root *root,
1901 struct btrfs_key *key,
1902 struct btrfs_chunk *chunk, int item_size)
1903{
1904 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1905 struct btrfs_disk_key disk_key;
1906 u32 array_size;
1907 u8 *ptr;
1908
1909 array_size = btrfs_super_sys_array_size(super_copy);
1910 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1911 return -EFBIG;
1912
1913 ptr = super_copy->sys_chunk_array + array_size;
1914 btrfs_cpu_key_to_disk(&disk_key, key);
1915 memcpy(ptr, &disk_key, sizeof(disk_key));
1916 ptr += sizeof(disk_key);
1917 memcpy(ptr, chunk, item_size);
1918 item_size += sizeof(disk_key);
1919 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1920 return 0;
1921}
1922
1923static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
1924 int num_stripes, int sub_stripes)
1925{
1926 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1927 return calc_size;
1928 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1929 return calc_size * (num_stripes / sub_stripes);
1930 else
1931 return calc_size * num_stripes;
1932}
1933
1934static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1935 struct btrfs_root *extent_root,
1936 struct map_lookup **map_ret,
1937 u64 *num_bytes, u64 *stripe_size,
1938 u64 start, u64 type)
1939{
1940 struct btrfs_fs_info *info = extent_root->fs_info;
1941 struct btrfs_device *device = NULL;
1942 struct btrfs_fs_devices *fs_devices = info->fs_devices;
1943 struct list_head *cur;
1944 struct map_lookup *map = NULL;
1945 struct extent_map_tree *em_tree;
1946 struct extent_map *em;
1947 struct list_head private_devs;
1948 int min_stripe_size = 1 * 1024 * 1024;
1949 u64 calc_size = 1024 * 1024 * 1024;
1950 u64 max_chunk_size = calc_size;
1951 u64 min_free;
1952 u64 avail;
1953 u64 max_avail = 0;
1954 u64 dev_offset;
1955 int num_stripes = 1;
1956 int min_stripes = 1;
1957 int sub_stripes = 0;
1958 int looped = 0;
1959 int ret;
1960 int index;
1961 int stripe_len = 64 * 1024;
1962
1963 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1964 (type & BTRFS_BLOCK_GROUP_DUP)) {
1965 WARN_ON(1);
1966 type &= ~BTRFS_BLOCK_GROUP_DUP;
1967 }
1968 if (list_empty(&fs_devices->alloc_list))
1969 return -ENOSPC;
1970
1971 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1972 num_stripes = fs_devices->rw_devices;
1973 min_stripes = 2;
1974 }
1975 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
1976 num_stripes = 2;
1977 min_stripes = 2;
1978 }
1979 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
1980 num_stripes = min_t(u64, 2, fs_devices->rw_devices);
1981 if (num_stripes < 2)
1982 return -ENOSPC;
1983 min_stripes = 2;
1984 }
1985 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1986 num_stripes = fs_devices->rw_devices;
1987 if (num_stripes < 4)
1988 return -ENOSPC;
1989 num_stripes &= ~(u32)1;
1990 sub_stripes = 2;
1991 min_stripes = 4;
1992 }
1993
1994 if (type & BTRFS_BLOCK_GROUP_DATA) {
1995 max_chunk_size = 10 * calc_size;
1996 min_stripe_size = 64 * 1024 * 1024;
1997 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1998 max_chunk_size = 4 * calc_size;
1999 min_stripe_size = 32 * 1024 * 1024;
2000 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2001 calc_size = 8 * 1024 * 1024;
2002 max_chunk_size = calc_size * 2;
2003 min_stripe_size = 1 * 1024 * 1024;
2004 }
2005
2006 /* we don't want a chunk larger than 10% of writeable space */
2007 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2008 max_chunk_size);
2009
2010again:
2011 if (!map || map->num_stripes != num_stripes) {
2012 kfree(map);
2013 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2014 if (!map)
2015 return -ENOMEM;
2016 map->num_stripes = num_stripes;
2017 }
2018
2019 if (calc_size * num_stripes > max_chunk_size) {
2020 calc_size = max_chunk_size;
2021 do_div(calc_size, num_stripes);
2022 do_div(calc_size, stripe_len);
2023 calc_size *= stripe_len;
2024 }
2025 /* we don't want tiny stripes */
2026 calc_size = max_t(u64, min_stripe_size, calc_size);
2027
2028 do_div(calc_size, stripe_len);
2029 calc_size *= stripe_len;
2030
2031 cur = fs_devices->alloc_list.next;
2032 index = 0;
2033
2034 if (type & BTRFS_BLOCK_GROUP_DUP)
2035 min_free = calc_size * 2;
2036 else
2037 min_free = calc_size;
2038
2039 /*
2040 * we add 1MB because we never use the first 1MB of the device, unless
2041 * we've looped, then we are likely allocating the maximum amount of
2042 * space left already
2043 */
2044 if (!looped)
2045 min_free += 1024 * 1024;
2046
2047 INIT_LIST_HEAD(&private_devs);
2048 while (index < num_stripes) {
2049 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2050 BUG_ON(!device->writeable);
2051 if (device->total_bytes > device->bytes_used)
2052 avail = device->total_bytes - device->bytes_used;
2053 else
2054 avail = 0;
2055 cur = cur->next;
2056
2057 if (device->in_fs_metadata && avail >= min_free) {
2058 ret = find_free_dev_extent(trans, device,
2059 min_free, &dev_offset);
2060 if (ret == 0) {
2061 list_move_tail(&device->dev_alloc_list,
2062 &private_devs);
2063 map->stripes[index].dev = device;
2064 map->stripes[index].physical = dev_offset;
2065 index++;
2066 if (type & BTRFS_BLOCK_GROUP_DUP) {
2067 map->stripes[index].dev = device;
2068 map->stripes[index].physical =
2069 dev_offset + calc_size;
2070 index++;
2071 }
2072 }
2073 } else if (device->in_fs_metadata && avail > max_avail)
2074 max_avail = avail;
2075 if (cur == &fs_devices->alloc_list)
2076 break;
2077 }
2078 list_splice(&private_devs, &fs_devices->alloc_list);
2079 if (index < num_stripes) {
2080 if (index >= min_stripes) {
2081 num_stripes = index;
2082 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2083 num_stripes /= sub_stripes;
2084 num_stripes *= sub_stripes;
2085 }
2086 looped = 1;
2087 goto again;
2088 }
2089 if (!looped && max_avail > 0) {
2090 looped = 1;
2091 calc_size = max_avail;
2092 goto again;
2093 }
2094 kfree(map);
2095 return -ENOSPC;
2096 }
2097 map->sector_size = extent_root->sectorsize;
2098 map->stripe_len = stripe_len;
2099 map->io_align = stripe_len;
2100 map->io_width = stripe_len;
2101 map->type = type;
2102 map->num_stripes = num_stripes;
2103 map->sub_stripes = sub_stripes;
2104
2105 *map_ret = map;
2106 *stripe_size = calc_size;
2107 *num_bytes = chunk_bytes_by_type(type, calc_size,
2108 num_stripes, sub_stripes);
2109
2110 em = alloc_extent_map(GFP_NOFS);
2111 if (!em) {
2112 kfree(map);
2113 return -ENOMEM;
2114 }
2115 em->bdev = (struct block_device *)map;
2116 em->start = start;
2117 em->len = *num_bytes;
2118 em->block_start = 0;
2119 em->block_len = em->len;
2120
2121 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2122 spin_lock(&em_tree->lock);
2123 ret = add_extent_mapping(em_tree, em);
2124 spin_unlock(&em_tree->lock);
2125 BUG_ON(ret);
2126 free_extent_map(em);
2127
2128 ret = btrfs_make_block_group(trans, extent_root, 0, type,
2129 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2130 start, *num_bytes);
2131 BUG_ON(ret);
2132
2133 index = 0;
2134 while (index < map->num_stripes) {
2135 device = map->stripes[index].dev;
2136 dev_offset = map->stripes[index].physical;
2137
2138 ret = btrfs_alloc_dev_extent(trans, device,
2139 info->chunk_root->root_key.objectid,
2140 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2141 start, dev_offset, calc_size);
2142 BUG_ON(ret);
2143 index++;
2144 }
2145
2146 return 0;
2147}
2148
2149static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2150 struct btrfs_root *extent_root,
2151 struct map_lookup *map, u64 chunk_offset,
2152 u64 chunk_size, u64 stripe_size)
2153{
2154 u64 dev_offset;
2155 struct btrfs_key key;
2156 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2157 struct btrfs_device *device;
2158 struct btrfs_chunk *chunk;
2159 struct btrfs_stripe *stripe;
2160 size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2161 int index = 0;
2162 int ret;
2163
2164 chunk = kzalloc(item_size, GFP_NOFS);
2165 if (!chunk)
2166 return -ENOMEM;
2167
2168 index = 0;
2169 while (index < map->num_stripes) {
2170 device = map->stripes[index].dev;
2171 device->bytes_used += stripe_size;
2172 ret = btrfs_update_device(trans, device);
2173 BUG_ON(ret);
2174 index++;
2175 }
2176
2177 index = 0;
2178 stripe = &chunk->stripe;
2179 while (index < map->num_stripes) {
2180 device = map->stripes[index].dev;
2181 dev_offset = map->stripes[index].physical;
2182
2183 btrfs_set_stack_stripe_devid(stripe, device->devid);
2184 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2185 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2186 stripe++;
2187 index++;
2188 }
2189
2190 btrfs_set_stack_chunk_length(chunk, chunk_size);
2191 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2192 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2193 btrfs_set_stack_chunk_type(chunk, map->type);
2194 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2195 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2196 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2197 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2198 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2199
2200 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2201 key.type = BTRFS_CHUNK_ITEM_KEY;
2202 key.offset = chunk_offset;
2203
2204 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2205 BUG_ON(ret);
2206
2207 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2208 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2209 item_size);
2210 BUG_ON(ret);
2211 }
2212 kfree(chunk);
2213 return 0;
2214}
2215
2216/*
2217 * Chunk allocation falls into two parts. The first part does works
2218 * that make the new allocated chunk useable, but not do any operation
2219 * that modifies the chunk tree. The second part does the works that
2220 * require modifying the chunk tree. This division is important for the
2221 * bootstrap process of adding storage to a seed btrfs.
2222 */
2223int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2224 struct btrfs_root *extent_root, u64 type)
2225{
2226 u64 chunk_offset;
2227 u64 chunk_size;
2228 u64 stripe_size;
2229 struct map_lookup *map;
2230 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2231 int ret;
2232
2233 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2234 &chunk_offset);
2235 if (ret)
2236 return ret;
2237
2238 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2239 &stripe_size, chunk_offset, type);
2240 if (ret)
2241 return ret;
2242
2243 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2244 chunk_size, stripe_size);
2245 BUG_ON(ret);
2246 return 0;
2247}
2248
2249static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2250 struct btrfs_root *root,
2251 struct btrfs_device *device)
2252{
2253 u64 chunk_offset;
2254 u64 sys_chunk_offset;
2255 u64 chunk_size;
2256 u64 sys_chunk_size;
2257 u64 stripe_size;
2258 u64 sys_stripe_size;
2259 u64 alloc_profile;
2260 struct map_lookup *map;
2261 struct map_lookup *sys_map;
2262 struct btrfs_fs_info *fs_info = root->fs_info;
2263 struct btrfs_root *extent_root = fs_info->extent_root;
2264 int ret;
2265
2266 ret = find_next_chunk(fs_info->chunk_root,
2267 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2268 BUG_ON(ret);
2269
2270 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2271 (fs_info->metadata_alloc_profile &
2272 fs_info->avail_metadata_alloc_bits);
2273 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2274
2275 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2276 &stripe_size, chunk_offset, alloc_profile);
2277 BUG_ON(ret);
2278
2279 sys_chunk_offset = chunk_offset + chunk_size;
2280
2281 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2282 (fs_info->system_alloc_profile &
2283 fs_info->avail_system_alloc_bits);
2284 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2285
2286 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2287 &sys_chunk_size, &sys_stripe_size,
2288 sys_chunk_offset, alloc_profile);
2289 BUG_ON(ret);
2290
2291 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2292 BUG_ON(ret);
2293
2294 /*
2295 * Modifying chunk tree needs allocating new blocks from both
2296 * system block group and metadata block group. So we only can
2297 * do operations require modifying the chunk tree after both
2298 * block groups were created.
2299 */
2300 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2301 chunk_size, stripe_size);
2302 BUG_ON(ret);
2303
2304 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2305 sys_chunk_offset, sys_chunk_size,
2306 sys_stripe_size);
2307 BUG_ON(ret);
2308 return 0;
2309}
2310
2311int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2312{
2313 struct extent_map *em;
2314 struct map_lookup *map;
2315 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2316 int readonly = 0;
2317 int i;
2318
2319 spin_lock(&map_tree->map_tree.lock);
2320 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2321 spin_unlock(&map_tree->map_tree.lock);
2322 if (!em)
2323 return 1;
2324
2325 map = (struct map_lookup *)em->bdev;
2326 for (i = 0; i < map->num_stripes; i++) {
2327 if (!map->stripes[i].dev->writeable) {
2328 readonly = 1;
2329 break;
2330 }
2331 }
2332 free_extent_map(em);
2333 return readonly;
2334}
2335
/*
 * Initialise the extent map tree embedded in @tree, passing GFP_NOFS as
 * the allocation mask.
 */
void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
	extent_map_tree_init(&tree->map_tree, GFP_NOFS);
}
2340
2341void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2342{
2343 struct extent_map *em;
2344
2345 while (1) {
2346 spin_lock(&tree->map_tree.lock);
2347 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2348 if (em)
2349 remove_extent_mapping(&tree->map_tree, em);
2350 spin_unlock(&tree->map_tree.lock);
2351 if (!em)
2352 break;
2353 kfree(em->bdev);
2354 /* once for us */
2355 free_extent_map(em);
2356 /* once for the tree */
2357 free_extent_map(em);
2358 }
2359}
2360
2361int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2362{
2363 struct extent_map *em;
2364 struct map_lookup *map;
2365 struct extent_map_tree *em_tree = &map_tree->map_tree;
2366 int ret;
2367
2368 spin_lock(&em_tree->lock);
2369 em = lookup_extent_mapping(em_tree, logical, len);
2370 spin_unlock(&em_tree->lock);
2371 BUG_ON(!em);
2372
2373 BUG_ON(em->start > logical || em->start + em->len < logical);
2374 map = (struct map_lookup *)em->bdev;
2375 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2376 ret = map->num_stripes;
2377 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2378 ret = map->sub_stripes;
2379 else
2380 ret = 1;
2381 free_extent_map(em);
2382 return ret;
2383}
2384
2385static int find_live_mirror(struct map_lookup *map, int first, int num,
2386 int optimal)
2387{
2388 int i;
2389 if (map->stripes[optimal].dev->bdev)
2390 return optimal;
2391 for (i = first; i < first + num; i++) {
2392 if (map->stripes[i].dev->bdev)
2393 return i;
2394 }
2395 /* we couldn't find one that doesn't fail. Just return something
2396 * and the io error handling code will clean up eventually
2397 */
2398 return optimal;
2399}
2400
2401static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2402 u64 logical, u64 *length,
2403 struct btrfs_multi_bio **multi_ret,
2404 int mirror_num, struct page *unplug_page)
2405{
2406 struct extent_map *em;
2407 struct map_lookup *map;
2408 struct extent_map_tree *em_tree = &map_tree->map_tree;
2409 u64 offset;
2410 u64 stripe_offset;
2411 u64 stripe_nr;
2412 int stripes_allocated = 8;
2413 int stripes_required = 1;
2414 int stripe_index;
2415 int i;
2416 int num_stripes;
2417 int max_errors = 0;
2418 struct btrfs_multi_bio *multi = NULL;
2419
2420 if (multi_ret && !(rw & (1 << BIO_RW)))
2421 stripes_allocated = 1;
2422again:
2423 if (multi_ret) {
2424 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2425 GFP_NOFS);
2426 if (!multi)
2427 return -ENOMEM;
2428
2429 atomic_set(&multi->error, 0);
2430 }
2431
2432 spin_lock(&em_tree->lock);
2433 em = lookup_extent_mapping(em_tree, logical, *length);
2434 spin_unlock(&em_tree->lock);
2435
2436 if (!em && unplug_page)
2437 return 0;
2438
2439 if (!em) {
2440 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2441 (unsigned long long)logical,
2442 (unsigned long long)*length);
2443 BUG();
2444 }
2445
2446 BUG_ON(em->start > logical || em->start + em->len < logical);
2447 map = (struct map_lookup *)em->bdev;
2448 offset = logical - em->start;
2449
2450 if (mirror_num > map->num_stripes)
2451 mirror_num = 0;
2452
2453 /* if our multi bio struct is too small, back off and try again */
2454 if (rw & (1 << BIO_RW)) {
2455 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2456 BTRFS_BLOCK_GROUP_DUP)) {
2457 stripes_required = map->num_stripes;
2458 max_errors = 1;
2459 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2460 stripes_required = map->sub_stripes;
2461 max_errors = 1;
2462 }
2463 }
2464 if (multi_ret && rw == WRITE &&
2465 stripes_allocated < stripes_required) {
2466 stripes_allocated = map->num_stripes;
2467 free_extent_map(em);
2468 kfree(multi);
2469 goto again;
2470 }
2471 stripe_nr = offset;
2472 /*
2473 * stripe_nr counts the total number of stripes we have to stride
2474 * to get to this block
2475 */
2476 do_div(stripe_nr, map->stripe_len);
2477
2478 stripe_offset = stripe_nr * map->stripe_len;
2479 BUG_ON(offset < stripe_offset);
2480
2481 /* stripe_offset is the offset of this block in its stripe*/
2482 stripe_offset = offset - stripe_offset;
2483
2484 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2485 BTRFS_BLOCK_GROUP_RAID10 |
2486 BTRFS_BLOCK_GROUP_DUP)) {
2487 /* we limit the length of each bio to what fits in a stripe */
2488 *length = min_t(u64, em->len - offset,
2489 map->stripe_len - stripe_offset);
2490 } else {
2491 *length = em->len - offset;
2492 }
2493
2494 if (!multi_ret && !unplug_page)
2495 goto out;
2496
2497 num_stripes = 1;
2498 stripe_index = 0;
2499 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2500 if (unplug_page || (rw & (1 << BIO_RW)))
2501 num_stripes = map->num_stripes;
2502 else if (mirror_num)
2503 stripe_index = mirror_num - 1;
2504 else {
2505 stripe_index = find_live_mirror(map, 0,
2506 map->num_stripes,
2507 current->pid % map->num_stripes);
2508 }
2509
2510 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2511 if (rw & (1 << BIO_RW))
2512 num_stripes = map->num_stripes;
2513 else if (mirror_num)
2514 stripe_index = mirror_num - 1;
2515
2516 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2517 int factor = map->num_stripes / map->sub_stripes;
2518
2519 stripe_index = do_div(stripe_nr, factor);
2520 stripe_index *= map->sub_stripes;
2521
2522 if (unplug_page || (rw & (1 << BIO_RW)))
2523 num_stripes = map->sub_stripes;
2524 else if (mirror_num)
2525 stripe_index += mirror_num - 1;
2526 else {
2527 stripe_index = find_live_mirror(map, stripe_index,
2528 map->sub_stripes, stripe_index +
2529 current->pid % map->sub_stripes);
2530 }
2531 } else {
2532 /*
2533 * after this do_div call, stripe_nr is the number of stripes
2534 * on this device we have to walk to find the data, and
2535 * stripe_index is the number of our device in the stripe array
2536 */
2537 stripe_index = do_div(stripe_nr, map->num_stripes);
2538 }
2539 BUG_ON(stripe_index >= map->num_stripes);
2540
2541 for (i = 0; i < num_stripes; i++) {
2542 if (unplug_page) {
2543 struct btrfs_device *device;
2544 struct backing_dev_info *bdi;
2545
2546 device = map->stripes[stripe_index].dev;
2547 if (device->bdev) {
2548 bdi = blk_get_backing_dev_info(device->bdev);
2549 if (bdi->unplug_io_fn)
2550 bdi->unplug_io_fn(bdi, unplug_page);
2551 }
2552 } else {
2553 multi->stripes[i].physical =
2554 map->stripes[stripe_index].physical +
2555 stripe_offset + stripe_nr * map->stripe_len;
2556 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2557 }
2558 stripe_index++;
2559 }
2560 if (multi_ret) {
2561 *multi_ret = multi;
2562 multi->num_stripes = num_stripes;
2563 multi->max_errors = max_errors;
2564 }
2565out:
2566 free_extent_map(em);
2567 return 0;
2568}
2569
/*
 * Map a logical range to its physical stripes.  Thin wrapper around
 * __btrfs_map_block() that passes a NULL unplug page, so no device
 * unplugging is done.
 */
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
		    u64 logical, u64 *length,
		    struct btrfs_multi_bio **multi_ret, int mirror_num)
{
	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
				 mirror_num, NULL);
}
2577
2578int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2579 u64 chunk_start, u64 physical, u64 devid,
2580 u64 **logical, int *naddrs, int *stripe_len)
2581{
2582 struct extent_map_tree *em_tree = &map_tree->map_tree;
2583 struct extent_map *em;
2584 struct map_lookup *map;
2585 u64 *buf;
2586 u64 bytenr;
2587 u64 length;
2588 u64 stripe_nr;
2589 int i, j, nr = 0;
2590
2591 spin_lock(&em_tree->lock);
2592 em = lookup_extent_mapping(em_tree, chunk_start, 1);
2593 spin_unlock(&em_tree->lock);
2594
2595 BUG_ON(!em || em->start != chunk_start);
2596 map = (struct map_lookup *)em->bdev;
2597
2598 length = em->len;
2599 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2600 do_div(length, map->num_stripes / map->sub_stripes);
2601 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
2602 do_div(length, map->num_stripes);
2603
2604 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
2605 BUG_ON(!buf);
2606
2607 for (i = 0; i < map->num_stripes; i++) {
2608 if (devid && map->stripes[i].dev->devid != devid)
2609 continue;
2610 if (map->stripes[i].physical > physical ||
2611 map->stripes[i].physical + length <= physical)
2612 continue;
2613
2614 stripe_nr = physical - map->stripes[i].physical;
2615 do_div(stripe_nr, map->stripe_len);
2616
2617 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2618 stripe_nr = stripe_nr * map->num_stripes + i;
2619 do_div(stripe_nr, map->sub_stripes);
2620 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2621 stripe_nr = stripe_nr * map->num_stripes + i;
2622 }
2623 bytenr = chunk_start + stripe_nr * map->stripe_len;
2624 WARN_ON(nr >= map->num_stripes);
2625 for (j = 0; j < nr; j++) {
2626 if (buf[j] == bytenr)
2627 break;
2628 }
2629 if (j == nr) {
2630 WARN_ON(nr >= map->num_stripes);
2631 buf[nr++] = bytenr;
2632 }
2633 }
2634
2635 for (i = 0; i > nr; i++) {
2636 struct btrfs_multi_bio *multi;
2637 struct btrfs_bio_stripe *stripe;
2638 int ret;
2639
2640 length = 1;
2641 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2642 &length, &multi, 0);
2643 BUG_ON(ret);
2644
2645 stripe = multi->stripes;
2646 for (j = 0; j < multi->num_stripes; j++) {
2647 if (stripe->physical >= physical &&
2648 physical < stripe->physical + length)
2649 break;
2650 }
2651 BUG_ON(j >= multi->num_stripes);
2652 kfree(multi);
2653 }
2654
2655 *logical = buf;
2656 *naddrs = nr;
2657 *stripe_len = map->stripe_len;
2658
2659 free_extent_map(em);
2660 return 0;
2661}
2662
/*
 * Unplug the device(s) backing the page-sized range that starts at
 * @logical.  Calls __btrfs_map_block() in its unplug mode: READ, no multi
 * bio requested, @page passed as the unplug page.
 */
int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
		      u64 logical, struct page *page)
{
	u64 length = PAGE_CACHE_SIZE;
	return __btrfs_map_block(map_tree, READ, logical, &length,
				 NULL, 0, page);
}
2670
2671static void end_bio_multi_stripe(struct bio *bio, int err)
2672{
2673 struct btrfs_multi_bio *multi = bio->bi_private;
2674 int is_orig_bio = 0;
2675
2676 if (err)
2677 atomic_inc(&multi->error);
2678
2679 if (bio == multi->orig_bio)
2680 is_orig_bio = 1;
2681
2682 if (atomic_dec_and_test(&multi->stripes_pending)) {
2683 if (!is_orig_bio) {
2684 bio_put(bio);
2685 bio = multi->orig_bio;
2686 }
2687 bio->bi_private = multi->private;
2688 bio->bi_end_io = multi->end_io;
2689 /* only send an error to the higher layers if it is
2690 * beyond the tolerance of the multi-bio
2691 */
2692 if (atomic_read(&multi->error) > multi->max_errors) {
2693 err = -EIO;
2694 } else if (err) {
2695 /*
2696 * this bio is actually up to date, we didn't
2697 * go over the max number of errors
2698 */
2699 set_bit(BIO_UPTODATE, &bio->bi_flags);
2700 err = 0;
2701 }
2702 kfree(multi);
2703
2704 bio_endio(bio, err);
2705 } else if (!is_orig_bio) {
2706 bio_put(bio);
2707 }
2708}
2709
/*
 * Bundles a bio, its rw flags, the owning fs_info and a btrfs_work item.
 * NOTE(review): not referenced by any code in the visible part of this
 * file; presumably a leftover from an earlier async submit scheme -
 * confirm before relying on it.
 */
struct async_sched {
	struct bio *bio;
	int rw;
	struct btrfs_fs_info *info;
	struct btrfs_work work;
};
2716
2717/*
2718 * see run_scheduled_bios for a description of why bios are collected for
2719 * async submit.
2720 *
2721 * This will add one bio to the pending list for a device and make sure
2722 * the work struct is scheduled.
2723 */
2724static noinline int schedule_bio(struct btrfs_root *root,
2725 struct btrfs_device *device,
2726 int rw, struct bio *bio)
2727{
2728 int should_queue = 1;
2729
2730 /* don't bother with additional async steps for reads, right now */
2731 if (!(rw & (1 << BIO_RW))) {
2732 bio_get(bio);
2733 submit_bio(rw, bio);
2734 bio_put(bio);
2735 return 0;
2736 }
2737
2738 /*
2739 * nr_async_bios allows us to reliably return congestion to the
2740 * higher layers. Otherwise, the async bio makes it appear we have
2741 * made progress against dirty pages when we've really just put it
2742 * on a queue for later
2743 */
2744 atomic_inc(&root->fs_info->nr_async_bios);
2745 WARN_ON(bio->bi_next);
2746 bio->bi_next = NULL;
2747 bio->bi_rw |= rw;
2748
2749 spin_lock(&device->io_lock);
2750
2751 if (device->pending_bio_tail)
2752 device->pending_bio_tail->bi_next = bio;
2753
2754 device->pending_bio_tail = bio;
2755 if (!device->pending_bios)
2756 device->pending_bios = bio;
2757 if (device->running_pending)
2758 should_queue = 0;
2759
2760 spin_unlock(&device->io_lock);
2761
2762 if (should_queue)
2763 btrfs_queue_worker(&root->fs_info->submit_workers,
2764 &device->work);
2765 return 0;
2766}
2767
2768int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2769 int mirror_num, int async_submit)
2770{
2771 struct btrfs_mapping_tree *map_tree;
2772 struct btrfs_device *dev;
2773 struct bio *first_bio = bio;
2774 u64 logical = (u64)bio->bi_sector << 9;
2775 u64 length = 0;
2776 u64 map_length;
2777 struct btrfs_multi_bio *multi = NULL;
2778 int ret;
2779 int dev_nr = 0;
2780 int total_devs = 1;
2781
2782 length = bio->bi_size;
2783 map_tree = &root->fs_info->mapping_tree;
2784 map_length = length;
2785
2786 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2787 mirror_num);
2788 BUG_ON(ret);
2789
2790 total_devs = multi->num_stripes;
2791 if (map_length < length) {
2792 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
2793 "len %llu\n", (unsigned long long)logical,
2794 (unsigned long long)length,
2795 (unsigned long long)map_length);
2796 BUG();
2797 }
2798 multi->end_io = first_bio->bi_end_io;
2799 multi->private = first_bio->bi_private;
2800 multi->orig_bio = first_bio;
2801 atomic_set(&multi->stripes_pending, multi->num_stripes);
2802
2803 while (dev_nr < total_devs) {
2804 if (total_devs > 1) {
2805 if (dev_nr < total_devs - 1) {
2806 bio = bio_clone(first_bio, GFP_NOFS);
2807 BUG_ON(!bio);
2808 } else {
2809 bio = first_bio;
2810 }
2811 bio->bi_private = multi;
2812 bio->bi_end_io = end_bio_multi_stripe;
2813 }
2814 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2815 dev = multi->stripes[dev_nr].dev;
2816 BUG_ON(rw == WRITE && !dev->writeable);
2817 if (dev && dev->bdev) {
2818 bio->bi_bdev = dev->bdev;
2819 if (async_submit)
2820 schedule_bio(root, dev, rw, bio);
2821 else
2822 submit_bio(rw, bio);
2823 } else {
2824 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2825 bio->bi_sector = logical >> 9;
2826 bio_endio(bio, -EIO);
2827 }
2828 dev_nr++;
2829 }
2830 if (total_devs == 1)
2831 kfree(multi);
2832 return 0;
2833}
2834
2835struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2836 u8 *uuid, u8 *fsid)
2837{
2838 struct btrfs_device *device;
2839 struct btrfs_fs_devices *cur_devices;
2840
2841 cur_devices = root->fs_info->fs_devices;
2842 while (cur_devices) {
2843 if (!fsid ||
2844 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2845 device = __find_device(&cur_devices->devices,
2846 devid, uuid);
2847 if (device)
2848 return device;
2849 }
2850 cur_devices = cur_devices->seed;
2851 }
2852 return NULL;
2853}
2854
2855static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2856 u64 devid, u8 *dev_uuid)
2857{
2858 struct btrfs_device *device;
2859 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2860
2861 device = kzalloc(sizeof(*device), GFP_NOFS);
2862 if (!device)
2863 return NULL;
2864 list_add(&device->dev_list,
2865 &fs_devices->devices);
2866 device->barriers = 1;
2867 device->dev_root = root->fs_info->dev_root;
2868 device->devid = devid;
2869 device->work.func = pending_bios_fn;
2870 device->fs_devices = fs_devices;
2871 fs_devices->num_devices++;
2872 spin_lock_init(&device->io_lock);
2873 INIT_LIST_HEAD(&device->dev_alloc_list);
2874 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2875 return device;
2876}
2877
2878static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2879 struct extent_buffer *leaf,
2880 struct btrfs_chunk *chunk)
2881{
2882 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2883 struct map_lookup *map;
2884 struct extent_map *em;
2885 u64 logical;
2886 u64 length;
2887 u64 devid;
2888 u8 uuid[BTRFS_UUID_SIZE];
2889 int num_stripes;
2890 int ret;
2891 int i;
2892
2893 logical = key->offset;
2894 length = btrfs_chunk_length(leaf, chunk);
2895
2896 spin_lock(&map_tree->map_tree.lock);
2897 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2898 spin_unlock(&map_tree->map_tree.lock);
2899
2900 /* already mapped? */
2901 if (em && em->start <= logical && em->start + em->len > logical) {
2902 free_extent_map(em);
2903 return 0;
2904 } else if (em) {
2905 free_extent_map(em);
2906 }
2907
2908 map = kzalloc(sizeof(*map), GFP_NOFS);
2909 if (!map)
2910 return -ENOMEM;
2911
2912 em = alloc_extent_map(GFP_NOFS);
2913 if (!em)
2914 return -ENOMEM;
2915 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2916 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2917 if (!map) {
2918 free_extent_map(em);
2919 return -ENOMEM;
2920 }
2921
2922 em->bdev = (struct block_device *)map;
2923 em->start = logical;
2924 em->len = length;
2925 em->block_start = 0;
2926 em->block_len = em->len;
2927
2928 map->num_stripes = num_stripes;
2929 map->io_width = btrfs_chunk_io_width(leaf, chunk);
2930 map->io_align = btrfs_chunk_io_align(leaf, chunk);
2931 map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2932 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2933 map->type = btrfs_chunk_type(leaf, chunk);
2934 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2935 for (i = 0; i < num_stripes; i++) {
2936 map->stripes[i].physical =
2937 btrfs_stripe_offset_nr(leaf, chunk, i);
2938 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2939 read_extent_buffer(leaf, uuid, (unsigned long)
2940 btrfs_stripe_dev_uuid_nr(chunk, i),
2941 BTRFS_UUID_SIZE);
2942 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
2943 NULL);
2944 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2945 kfree(map);
2946 free_extent_map(em);
2947 return -EIO;
2948 }
2949 if (!map->stripes[i].dev) {
2950 map->stripes[i].dev =
2951 add_missing_dev(root, devid, uuid);
2952 if (!map->stripes[i].dev) {
2953 kfree(map);
2954 free_extent_map(em);
2955 return -EIO;
2956 }
2957 }
2958 map->stripes[i].dev->in_fs_metadata = 1;
2959 }
2960
2961 spin_lock(&map_tree->map_tree.lock);
2962 ret = add_extent_mapping(&map_tree->map_tree, em);
2963 spin_unlock(&map_tree->map_tree.lock);
2964 BUG_ON(ret);
2965 free_extent_map(em);
2966
2967 return 0;
2968}
2969
2970static int fill_device_from_item(struct extent_buffer *leaf,
2971 struct btrfs_dev_item *dev_item,
2972 struct btrfs_device *device)
2973{
2974 unsigned long ptr;
2975
2976 device->devid = btrfs_device_id(leaf, dev_item);
2977 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2978 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2979 device->type = btrfs_device_type(leaf, dev_item);
2980 device->io_align = btrfs_device_io_align(leaf, dev_item);
2981 device->io_width = btrfs_device_io_width(leaf, dev_item);
2982 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2983
2984 ptr = (unsigned long)btrfs_device_uuid(dev_item);
2985 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2986
2987 return 0;
2988}
2989
2990static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
2991{
2992 struct btrfs_fs_devices *fs_devices;
2993 int ret;
2994
2995 mutex_lock(&uuid_mutex);
2996
2997 fs_devices = root->fs_info->fs_devices->seed;
2998 while (fs_devices) {
2999 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3000 ret = 0;
3001 goto out;
3002 }
3003 fs_devices = fs_devices->seed;
3004 }
3005
3006 fs_devices = find_fsid(fsid);
3007 if (!fs_devices) {
3008 ret = -ENOENT;
3009 goto out;
3010 }
3011
3012 fs_devices = clone_fs_devices(fs_devices);
3013 if (IS_ERR(fs_devices)) {
3014 ret = PTR_ERR(fs_devices);
3015 goto out;
3016 }
3017
3018 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3019 root->fs_info->bdev_holder);
3020 if (ret)
3021 goto out;
3022
3023 if (!fs_devices->seeding) {
3024 __btrfs_close_devices(fs_devices);
3025 free_fs_devices(fs_devices);
3026 ret = -EINVAL;
3027 goto out;
3028 }
3029
3030 fs_devices->seed = root->fs_info->fs_devices->seed;
3031 root->fs_info->fs_devices->seed = fs_devices;
3032out:
3033 mutex_unlock(&uuid_mutex);
3034 return ret;
3035}
3036
3037static int read_one_dev(struct btrfs_root *root,
3038 struct extent_buffer *leaf,
3039 struct btrfs_dev_item *dev_item)
3040{
3041 struct btrfs_device *device;
3042 u64 devid;
3043 int ret;
3044 u8 fs_uuid[BTRFS_UUID_SIZE];
3045 u8 dev_uuid[BTRFS_UUID_SIZE];
3046
3047 devid = btrfs_device_id(leaf, dev_item);
3048 read_extent_buffer(leaf, dev_uuid,
3049 (unsigned long)btrfs_device_uuid(dev_item),
3050 BTRFS_UUID_SIZE);
3051 read_extent_buffer(leaf, fs_uuid,
3052 (unsigned long)btrfs_device_fsid(dev_item),
3053 BTRFS_UUID_SIZE);
3054
3055 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3056 ret = open_seed_devices(root, fs_uuid);
3057 if (ret && !btrfs_test_opt(root, DEGRADED))
3058 return ret;
3059 }
3060
3061 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3062 if (!device || !device->bdev) {
3063 if (!btrfs_test_opt(root, DEGRADED))
3064 return -EIO;
3065
3066 if (!device) {
3067 printk(KERN_WARNING "warning devid %llu missing\n",
3068 (unsigned long long)devid);
3069 device = add_missing_dev(root, devid, dev_uuid);
3070 if (!device)
3071 return -ENOMEM;
3072 }
3073 }
3074
3075 if (device->fs_devices != root->fs_info->fs_devices) {
3076 BUG_ON(device->writeable);
3077 if (device->generation !=
3078 btrfs_device_generation(leaf, dev_item))
3079 return -EINVAL;
3080 }
3081
3082 fill_device_from_item(leaf, dev_item, device);
3083 device->dev_root = root->fs_info->dev_root;
3084 device->in_fs_metadata = 1;
3085 if (device->writeable)
3086 device->fs_devices->total_rw_bytes += device->total_bytes;
3087 ret = 0;
3088 return ret;
3089}
3090
/*
 * Read the device item embedded in the super block held in @buf.
 *
 * dev_item is not a dereferenceable pointer here: it carries the byte
 * offset of the dev_item field within struct btrfs_super_block, cast to a
 * pointer type.  NOTE(review): presumably the extent-buffer accessors used
 * by read_one_dev() treat it as an offset into @buf - confirm.
 */
int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
{
	struct btrfs_dev_item *dev_item;

	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
						     dev_item);
	return read_one_dev(root, buf, dev_item);
}
3099
3100int btrfs_read_sys_array(struct btrfs_root *root)
3101{
3102 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3103 struct extent_buffer *sb;
3104 struct btrfs_disk_key *disk_key;
3105 struct btrfs_chunk *chunk;
3106 u8 *ptr;
3107 unsigned long sb_ptr;
3108 int ret = 0;
3109 u32 num_stripes;
3110 u32 array_size;
3111 u32 len = 0;
3112 u32 cur;
3113 struct btrfs_key key;
3114
3115 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3116 BTRFS_SUPER_INFO_SIZE);
3117 if (!sb)
3118 return -ENOMEM;
3119 btrfs_set_buffer_uptodate(sb);
3120 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3121 array_size = btrfs_super_sys_array_size(super_copy);
3122
3123 ptr = super_copy->sys_chunk_array;
3124 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3125 cur = 0;
3126
3127 while (cur < array_size) {
3128 disk_key = (struct btrfs_disk_key *)ptr;
3129 btrfs_disk_key_to_cpu(&key, disk_key);
3130
3131 len = sizeof(*disk_key); ptr += len;
3132 sb_ptr += len;
3133 cur += len;
3134
3135 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3136 chunk = (struct btrfs_chunk *)sb_ptr;
3137 ret = read_one_chunk(root, &key, sb, chunk);
3138 if (ret)
3139 break;
3140 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3141 len = btrfs_chunk_item_size(num_stripes);
3142 } else {
3143 ret = -EIO;
3144 break;
3145 }
3146 ptr += len;
3147 sb_ptr += len;
3148 cur += len;
3149 }
3150 free_extent_buffer(sb);
3151 return ret;
3152}
3153
3154int btrfs_read_chunk_tree(struct btrfs_root *root)
3155{
3156 struct btrfs_path *path;
3157 struct extent_buffer *leaf;
3158 struct btrfs_key key;
3159 struct btrfs_key found_key;
3160 int ret;
3161 int slot;
3162
3163 root = root->fs_info->chunk_root;
3164
3165 path = btrfs_alloc_path();
3166 if (!path)
3167 return -ENOMEM;
3168
3169 /* first we search for all of the device items, and then we
3170 * read in all of the chunk items. This way we can create chunk
3171 * mappings that reference all of the devices that are afound
3172 */
3173 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3174 key.offset = 0;
3175 key.type = 0;
3176again:
3177 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3178 while (1) {
3179 leaf = path->nodes[0];
3180 slot = path->slots[0];
3181 if (slot >= btrfs_header_nritems(leaf)) {
3182 ret = btrfs_next_leaf(root, path);
3183 if (ret == 0)
3184 continue;
3185 if (ret < 0)
3186 goto error;
3187 break;
3188 }
3189 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3190 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3191 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3192 break;
3193 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3194 struct btrfs_dev_item *dev_item;
3195 dev_item = btrfs_item_ptr(leaf, slot,
3196 struct btrfs_dev_item);
3197 ret = read_one_dev(root, leaf, dev_item);
3198 if (ret)
3199 goto error;
3200 }
3201 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3202 struct btrfs_chunk *chunk;
3203 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3204 ret = read_one_chunk(root, &found_key, leaf, chunk);
3205 if (ret)
3206 goto error;
3207 }
3208 path->slots[0]++;
3209 }
3210 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3211 key.objectid = 0;
3212 btrfs_release_path(root, path);
3213 goto again;
3214 }
3215 ret = 0;
3216error:
3217 btrfs_free_path(path);
3218 return ret;
3219}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..86c44e9ae110
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,162 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_VOLUMES_
20#define __BTRFS_VOLUMES_
21
22#include <linux/bio.h>
23#include "async-thread.h"
24
25struct buffer_head;
/*
 * In-memory state of one device belonging to a btrfs filesystem.
 */
struct btrfs_device {
	/* entry in btrfs_fs_devices.devices */
	struct list_head dev_list;
	/* NOTE(review): presumably an entry in btrfs_fs_devices.alloc_list */
	struct list_head dev_alloc_list;
	/* the device set this device belongs to */
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_root *dev_root;
	/*
	 * singly linked list (through bio->bi_next) of write bios waiting
	 * for async submission, with a tail pointer for cheap appends;
	 * both are modified under io_lock
	 */
	struct bio *pending_bios;
	struct bio *pending_bio_tail;
	/* non-zero while the pending list is being processed */
	int running_pending;
	/* compared against the generation stored in the on-disk dev item */
	u64 generation;

	int barriers;
	/* non-zero if the device may be written to */
	int writeable;
	/* set once a device item or chunk stripe references this device */
	int in_fs_metadata;

	spinlock_t io_lock;

	/* NULL when the device is missing or not open */
	struct block_device *bdev;

	/* the mode sent to open_bdev_exclusive */
	fmode_t mode;

	char *name;

	/* the internal btrfs device id */
	u64 devid;

	/* size of the device */
	u64 total_bytes;

	/* bytes used */
	u64 bytes_used;

	/* optimal io alignment for this device */
	u32 io_align;

	/* optimal io width for this device */
	u32 io_width;

	/* minimal io size for this device */
	u32 sector_size;

	/* type and info about this device */
	u64 type;

	/* physical drive uuid (or lvm uuid) */
	u8 uuid[BTRFS_UUID_SIZE];

	/* work item queued to run the pending bio list */
	struct btrfs_work work;
};
75
/*
 * The set of devices that make up one filesystem, identified by its fsid.
 */
struct btrfs_fs_devices {
	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */

	/* the device with this id has the most recent copy of the super */
	u64 latest_devid;
	u64 latest_trans;
	u64 num_devices;
	u64 open_devices;
	u64 rw_devices;
	/* sum of total_bytes over the writeable devices */
	u64 total_rw_bytes;
	struct block_device *latest_bdev;
	/* all of the devices in the FS */
	struct list_head devices;

	/* devices not currently being allocated */
	struct list_head alloc_list;
	struct list_head list;

	/* next device set in the chain of seed filesystems, or NULL */
	struct btrfs_fs_devices *seed;
	/* non-zero if this device set is a seed filesystem */
	int seeding;

	int opened;
};
99
/* One target of a mapped request: a device and the byte offset on it. */
struct btrfs_bio_stripe {
	struct btrfs_device *dev;
	u64 physical;
};
104
/*
 * Bookkeeping for a request that is mapped onto one or more stripes.
 * Allocated with btrfs_multi_bio_size() so that stripes[] has room for the
 * wanted number of entries.
 */
struct btrfs_multi_bio {
	/* stripes still in flight; the last completion finishes orig_bio */
	atomic_t stripes_pending;
	/* the original bio's completion callback and private data */
	bio_end_io_t *end_io;
	struct bio *orig_bio;
	void *private;
	/* number of stripes that completed with an error */
	atomic_t error;
	/* errors tolerated before the request as a whole fails */
	int max_errors;
	/* number of valid entries in stripes[] */
	int num_stripes;
	struct btrfs_bio_stripe stripes[];
};
115
/* bytes needed for a btrfs_multi_bio with room for n stripes */
#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
			    (sizeof(struct btrfs_bio_stripe) * (n)))
118
/*
 * Functions exported by volumes.c: chunk and device-extent allocation,
 * logical <-> physical mapping, device discovery/open/close, and device
 * add/remove/resize/balance.
 */
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_device *device,
			   u64 chunk_tree, u64 chunk_objectid,
			   u64 chunk_offset, u64 start, u64 num_bytes);
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
		    u64 logical, u64 *length,
		    struct btrfs_multi_bio **multi_ret, int mirror_num);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
		     u64 chunk_start, u64 physical, u64 devid,
		     u64 **logical, int *naddrs, int *stripe_len);
int btrfs_read_sys_array(struct btrfs_root *root);
int btrfs_read_chunk_tree(struct btrfs_root *root);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
		      struct btrfs_root *extent_root, u64 type);
void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
		  int mirror_num, int async_submit);
int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder);
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
int btrfs_add_device(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_root *root, char *device_path);
int btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
		      u64 logical, struct page *page);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size);
struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
				       u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
int btrfs_balance(struct btrfs_root *dev_root);
void btrfs_unlock_volumes(void);
void btrfs_lock_volumes(void);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
162#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..7f332e270894
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,322 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/slab.h>
22#include <linux/rwsem.h>
23#include <linux/xattr.h>
24#include "ctree.h"
25#include "btrfs_inode.h"
26#include "transaction.h"
27#include "xattr.h"
28#include "disk-io.h"
29
30
31ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 struct btrfs_dir_item *di;
35 struct btrfs_root *root = BTRFS_I(inode)->root;
36 struct btrfs_path *path;
37 struct extent_buffer *leaf;
38 int ret = 0;
39 unsigned long data_ptr;
40
41 path = btrfs_alloc_path();
42 if (!path)
43 return -ENOMEM;
44
45 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0);
48 if (!di || IS_ERR(di)) {
49 ret = -ENODATA;
50 goto out;
51 }
52
53 leaf = path->nodes[0];
54 /* if size is 0, that means we want the size of the attr */
55 if (!size) {
56 ret = btrfs_dir_data_len(leaf, di);
57 goto out;
58 }
59
60 /* now get the data out of our dir_item */
61 if (btrfs_dir_data_len(leaf, di) > size) {
62 ret = -ERANGE;
63 goto out;
64 }
65 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr,
68 btrfs_dir_data_len(leaf, di));
69 ret = btrfs_dir_data_len(leaf, di);
70
71out:
72 btrfs_free_path(path);
73 return ret;
74}
75
76int __btrfs_setxattr(struct inode *inode, const char *name,
77 const void *value, size_t size, int flags)
78{
79 struct btrfs_dir_item *di;
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct btrfs_trans_handle *trans;
82 struct btrfs_path *path;
83 int ret = 0, mod = 0;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88
89 trans = btrfs_start_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode);
91
92 /* first lets see if we already have this xattr */
93 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
94 strlen(name), -1);
95 if (IS_ERR(di)) {
96 ret = PTR_ERR(di);
97 goto out;
98 }
99
100 /* ok we already have this xattr, lets remove it */
101 if (di) {
102 /* if we want create only exit */
103 if (flags & XATTR_CREATE) {
104 ret = -EEXIST;
105 goto out;
106 }
107
108 ret = btrfs_delete_one_dir_name(trans, root, path, di);
109 if (ret)
110 goto out;
111 btrfs_release_path(root, path);
112
113 /* if we don't have a value then we are removing the xattr */
114 if (!value) {
115 mod = 1;
116 goto out;
117 }
118 } else {
119 btrfs_release_path(root, path);
120
121 if (flags & XATTR_REPLACE) {
122 /* we couldn't find the attr to replace */
123 ret = -ENODATA;
124 goto out;
125 }
126 }
127
128 /* ok we have to create a completely new xattr */
129 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
130 value, size, inode->i_ino);
131 if (ret)
132 goto out;
133 mod = 1;
134
135out:
136 if (mod) {
137 inode->i_ctime = CURRENT_TIME;
138 ret = btrfs_update_inode(trans, root, inode);
139 }
140
141 btrfs_end_transaction(trans, root);
142 btrfs_free_path(path);
143 return ret;
144}
145
146ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 struct btrfs_key key, found_key;
149 struct inode *inode = dentry->d_inode;
150 struct btrfs_root *root = BTRFS_I(inode)->root;
151 struct btrfs_path *path;
152 struct btrfs_item *item;
153 struct extent_buffer *leaf;
154 struct btrfs_dir_item *di;
155 int ret = 0, slot, advance;
156 size_t total_size = 0, size_left = size;
157 unsigned long name_ptr;
158 size_t name_len;
159 u32 nritems;
160
161 /*
162 * ok we want all objects associated with this id.
163 * NOTE: we set key.offset = 0; because we want to start with the
164 * first xattr that we find and walk forward
165 */
166 key.objectid = inode->i_ino;
167 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
168 key.offset = 0;
169
170 path = btrfs_alloc_path();
171 if (!path)
172 return -ENOMEM;
173 path->reada = 2;
174
175 /* search for our xattrs */
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0)
178 goto err;
179 ret = 0;
180 advance = 0;
181 while (1) {
182 leaf = path->nodes[0];
183 nritems = btrfs_header_nritems(leaf);
184 slot = path->slots[0];
185
186 /* this is where we start walking through the path */
187 if (advance || slot >= nritems) {
188 /*
189 * if we've reached the last slot in this leaf we need
190 * to go to the next leaf and reset everything
191 */
192 if (slot >= nritems-1) {
193 ret = btrfs_next_leaf(root, path);
194 if (ret)
195 break;
196 leaf = path->nodes[0];
197 nritems = btrfs_header_nritems(leaf);
198 slot = path->slots[0];
199 } else {
200 /*
201 * just walking through the slots on this leaf
202 */
203 slot++;
204 path->slots[0]++;
205 }
206 }
207 advance = 1;
208
209 item = btrfs_item_nr(leaf, slot);
210 btrfs_item_key_to_cpu(leaf, &found_key, slot);
211
212 /* check to make sure this item is what we want */
213 if (found_key.objectid != key.objectid)
214 break;
215 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
216 break;
217
218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
219
220 name_len = btrfs_dir_name_len(leaf, di);
221 total_size += name_len + 1;
222
223 /* we are just looking for how big our buffer needs to be */
224 if (!size)
225 continue;
226
227 if (!buffer || (name_len + 1) > size_left) {
228 ret = -ERANGE;
229 goto err;
230 }
231
232 name_ptr = (unsigned long)(di + 1);
233 read_extent_buffer(leaf, buffer, name_ptr, name_len);
234 buffer[name_len] = '\0';
235
236 size_left -= name_len + 1;
237 buffer += name_len + 1;
238 }
239 ret = total_size;
240
241err:
242 btrfs_free_path(path);
243
244 return ret;
245}
246
247/*
248 * List of handlers for synthetic system.* attributes. All real ondisk
249 * attributes are handled directly.
 *
 * Only the two POSIX ACL handlers are listed, and only when
 * CONFIG_FS_POSIX_ACL is defined; the array always ends with a NULL entry.
250 */
251struct xattr_handler *btrfs_xattr_handlers[] = {
252#ifdef CONFIG_FS_POSIX_ACL
253 &btrfs_xattr_acl_access_handler,
254 &btrfs_xattr_acl_default_handler,
255#endif
256 NULL,
257};
258
259/*
260 * Check if the attribute is in a supported namespace.
261 *
262 * This applied after the check for the synthetic attributes in the system
263 * namespace.
264 */
265static bool btrfs_is_valid_xattr(const char *name)
266{
267 return !strncmp(name, XATTR_SECURITY_PREFIX,
268 XATTR_SECURITY_PREFIX_LEN) ||
269 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
270 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
271 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
272}
273
274ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
275 void *buffer, size_t size)
276{
277 /*
278 * If this is a request for a synthetic attribute in the system.*
279 * namespace use the generic infrastructure to resolve a handler
280 * for it via sb->s_xattr.
281 */
282 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
283 return generic_getxattr(dentry, name, buffer, size);
284
285 if (!btrfs_is_valid_xattr(name))
286 return -EOPNOTSUPP;
287 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
288}
289
290int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
291 size_t size, int flags)
292{
293 /*
294 * If this is a request for a synthetic attribute in the system.*
295 * namespace use the generic infrastructure to resolve a handler
296 * for it via sb->s_xattr.
297 */
298 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
299 return generic_setxattr(dentry, name, value, size, flags);
300
301 if (!btrfs_is_valid_xattr(name))
302 return -EOPNOTSUPP;
303
304 if (size == 0)
305 value = ""; /* empty EA, do not remove */
306 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
307}
308
309int btrfs_removexattr(struct dentry *dentry, const char *name)
310{
311 /*
312 * If this is a request for a synthetic attribute in the system.*
313 * namespace use the generic infrastructure to resolve a handler
314 * for it via sb->s_xattr.
315 */
316 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
317 return generic_removexattr(dentry, name);
318
319 if (!btrfs_is_valid_xattr(name))
320 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __XATTR__
20#define __XATTR__
21
22#include <linux/xattr.h>
23
24extern struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[];
27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size);
30extern int __btrfs_setxattr(struct inode *inode, const char *name,
31 const void *value, size_t size, int flags);
32
33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
34 void *buffer, size_t size);
35extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38
39#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..ecfbce836d32
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,632 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33#include "compression.h"
34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
/* per-user zlib state: one inflate stream, one deflate stream, one buffer */
44struct workspace {
45 z_stream inf_strm; /* stream handed to zlib_inflate*() */
46 z_stream def_strm; /* stream handed to zlib_deflate*() */
47 char *buf; /* PAGE_CACHE_SIZE buffer that inflate output is written to */
48 struct list_head list; /* links the workspace on idle_workspace */
49};
50
51static LIST_HEAD(idle_workspace); /* workspaces parked for reuse */
52static DEFINE_SPINLOCK(workspace_lock); /* taken around idle_workspace/num_workspace updates */
53static unsigned long num_workspace; /* number of entries on idle_workspace */
54static atomic_t alloc_workspace = ATOMIC_INIT(0); /* workspaces allocated and not yet freed */
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); /* tasks waiting for a workspace */
56
57/*
58 * this finds an available zlib workspace or allocates a new one
59 * NULL or an ERR_PTR is returned if things go bad.
60 */
61static struct workspace *find_zlib_workspace(void)
62{
63 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) {
90 ret = -ENOMEM;
91 goto fail;
92 }
93
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) {
106 ret = -ENOMEM;
107 goto fail_kmalloc;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142
143 atomic_dec(&alloc_workspace);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148
149/*
150 * cleanup function for module exit
151 */
152static void free_workspaces(void)
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165}
166
167/*
168 * given an address space and start/len, compress the bytes.
169 *
170 * pages are allocated to hold the compressed result and stored
171 * in 'pages'
172 *
173 * out_pages is used to return the number of pages allocated. There
174 * may be pages allocated even if we return an error
175 *
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{
195 int ret;
196 struct workspace *workspace;
197 char *data_in;
198 char *cpage_out;
199 int nr_pages = 0;
200 struct page *in_page = NULL;
201 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left;
205
206 *out_pages = 0;
207 *total_out = 0;
208 *total_in = 0;
209
210 workspace = find_zlib_workspace();
211 if (!workspace)
212 return -1;
213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
215 printk(KERN_WARNING "deflateInit failed\n");
216 ret = -1;
217 goto out;
218 }
219
220 workspace->def_strm.total_in = 0;
221 workspace->def_strm.total_out = 0;
222
223 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
224 data_in = kmap(in_page);
225
226 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
227 cpage_out = kmap(out_page);
228 pages[0] = out_page;
229 nr_pages = 1;
230
231 workspace->def_strm.next_in = data_in;
232 workspace->def_strm.next_out = cpage_out;
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) {
242 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
243 ret);
244 zlib_deflateEnd(&workspace->def_strm);
245 ret = -1;
246 goto out;
247 }
248
249 /* we're making it bigger, give up */
250 if (workspace->def_strm.total_in > 8192 &&
251 workspace->def_strm.total_in <
252 workspace->def_strm.total_out) {
253 ret = -1;
254 goto out;
255 }
256 /* we need another page for writing out. Test this
257 * before the total_in so we will pull in a new page for
258 * the stream end if required
259 */
260 if (workspace->def_strm.avail_out == 0) {
261 kunmap(out_page);
262 if (nr_pages == nr_dest_pages) {
263 out_page = NULL;
264 ret = -1;
265 goto out;
266 }
267 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
268 cpage_out = kmap(out_page);
269 pages[nr_pages] = out_page;
270 nr_pages++;
271 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
272 workspace->def_strm.next_out = cpage_out;
273 }
274 /* we're all done */
275 if (workspace->def_strm.total_in >= len)
276 break;
277
278 /* we've read in a full page, get a new one */
279 if (workspace->def_strm.avail_in == 0) {
280 if (workspace->def_strm.total_out > max_out)
281 break;
282
283 bytes_left = len - workspace->def_strm.total_in;
284 kunmap(in_page);
285 page_cache_release(in_page);
286
287 start += PAGE_CACHE_SIZE;
288 in_page = find_get_page(mapping,
289 start >> PAGE_CACHE_SHIFT);
290 data_in = kmap(in_page);
291 workspace->def_strm.avail_in = min(bytes_left,
292 PAGE_CACHE_SIZE);
293 workspace->def_strm.next_in = data_in;
294 }
295 }
296 workspace->def_strm.avail_in = 0;
297 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
298 zlib_deflateEnd(&workspace->def_strm);
299
300 if (ret != Z_STREAM_END) {
301 ret = -1;
302 goto out;
303 }
304
305 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
306 ret = -1;
307 goto out;
308 }
309
310 ret = 0;
311 *total_out = workspace->def_strm.total_out;
312 *total_in = workspace->def_strm.total_in;
313out:
314 *out_pages = nr_pages;
315 if (out_page)
316 kunmap(out_page);
317
318 if (in_page) {
319 kunmap(in_page);
320 page_cache_release(in_page);
321 }
322 free_workspace(workspace);
323 return ret;
324}
325
326/*
327 * pages_in is an array of pages with compressed data.
328 *
329 * disk_start is the starting logical offset of this array in the file
330 *
331 * bvec is a bio_vec of pages from the file that we want to decompress into
332 *
333 * vcnt is the count of pages in the biovec
334 *
335 * srclen is the number of bytes in pages_in
336 *
337 * The basic idea is that we have a bio that was created by readpages.
338 * The pages in the bio are for the uncompressed data, and they may not
339 * be contiguous. They all correspond to the range of bytes covered by
340 * the compressed extent.
341 */
342int btrfs_zlib_decompress_biovec(struct page **pages_in,
343 u64 disk_start,
344 struct bio_vec *bvec,
345 int vcnt,
346 size_t srclen)
347{
348 int ret = 0;
349 int wbits = MAX_WBITS;
350 struct workspace *workspace;
351 char *data_in;
352 size_t total_out = 0;
353 unsigned long page_bytes_left;
354 unsigned long page_in_index = 0;
355 unsigned long page_out_index = 0;
356 struct page *page_out;
357 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
358 PAGE_CACHE_SIZE;
359 unsigned long buf_start;
360 unsigned long buf_offset;
361 unsigned long bytes;
362 unsigned long working_bytes;
363 unsigned long pg_offset;
364 unsigned long start_byte;
365 unsigned long current_buf_start;
366 char *kaddr;
367
368 workspace = find_zlib_workspace();
369 if (!workspace)
370 return -ENOMEM;
371
372 data_in = kmap(pages_in[page_in_index]);
373 workspace->inf_strm.next_in = data_in;
374 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
375 workspace->inf_strm.total_in = 0;
376
377 workspace->inf_strm.total_out = 0;
378 workspace->inf_strm.next_out = workspace->buf;
379 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
380 page_out = bvec[page_out_index].bv_page;
381 page_bytes_left = PAGE_CACHE_SIZE;
382 pg_offset = 0;
383
384 /* If it's deflate, and it's got no preset dictionary, then
385 we can tell zlib to skip the adler32 check. */
386 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
387 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
388 !(((data_in[0]<<8) + data_in[1]) % 31)) {
389
390 wbits = -((data_in[0] >> 4) + 8);
391 workspace->inf_strm.next_in += 2;
392 workspace->inf_strm.avail_in -= 2;
393 }
394
395 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
396 printk(KERN_WARNING "inflateInit failed\n");
397 ret = -1;
398 goto out;
399 }
400 while (workspace->inf_strm.total_in < srclen) {
401 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
402 if (ret != Z_OK && ret != Z_STREAM_END)
403 break;
404 /*
405 * buf start is the byte offset we're of the start of
406 * our workspace buffer
407 */
408 buf_start = total_out;
409
410 /* total_out is the last byte of the workspace buffer */
411 total_out = workspace->inf_strm.total_out;
412
413 working_bytes = total_out - buf_start;
414
415 /*
416 * start byte is the first byte of the page we're currently
417 * copying into relative to the start of the compressed data.
418 */
419 start_byte = page_offset(page_out) - disk_start;
420
421 if (working_bytes == 0) {
422 /* we didn't make progress in this inflate
423 * call, we're done
424 */
425 if (ret != Z_STREAM_END)
426 ret = -1;
427 break;
428 }
429
430 /* we haven't yet hit data corresponding to this page */
431 if (total_out <= start_byte)
432 goto next;
433
434 /*
435 * the start of the data we care about is offset into
436 * the middle of our working buffer
437 */
438 if (total_out > start_byte && buf_start < start_byte) {
439 buf_offset = start_byte - buf_start;
440 working_bytes -= buf_offset;
441 } else {
442 buf_offset = 0;
443 }
444 current_buf_start = buf_start;
445
446 /* copy bytes from the working buffer into the pages */
447 while (working_bytes > 0) {
448 bytes = min(PAGE_CACHE_SIZE - pg_offset,
449 PAGE_CACHE_SIZE - buf_offset);
450 bytes = min(bytes, working_bytes);
451 kaddr = kmap_atomic(page_out, KM_USER0);
452 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
453 bytes);
454 kunmap_atomic(kaddr, KM_USER0);
455 flush_dcache_page(page_out);
456
457 pg_offset += bytes;
458 page_bytes_left -= bytes;
459 buf_offset += bytes;
460 working_bytes -= bytes;
461 current_buf_start += bytes;
462
463 /* check if we need to pick another page */
464 if (page_bytes_left == 0) {
465 page_out_index++;
466 if (page_out_index >= vcnt) {
467 ret = 0;
468 goto done;
469 }
470
471 page_out = bvec[page_out_index].bv_page;
472 pg_offset = 0;
473 page_bytes_left = PAGE_CACHE_SIZE;
474 start_byte = page_offset(page_out) - disk_start;
475
476 /*
477 * make sure our new page is covered by this
478 * working buffer
479 */
480 if (total_out <= start_byte)
481 goto next;
482
483 /* the next page in the biovec might not
484 * be adjacent to the last page, but it
485 * might still be found inside this working
486 * buffer. bump our offset pointer
487 */
488 if (total_out > start_byte &&
489 current_buf_start < start_byte) {
490 buf_offset = start_byte - buf_start;
491 working_bytes = total_out - start_byte;
492 current_buf_start = buf_start +
493 buf_offset;
494 }
495 }
496 }
497next:
498 workspace->inf_strm.next_out = workspace->buf;
499 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
500
501 if (workspace->inf_strm.avail_in == 0) {
502 unsigned long tmp;
503 kunmap(pages_in[page_in_index]);
504 page_in_index++;
505 if (page_in_index >= total_pages_in) {
506 data_in = NULL;
507 break;
508 }
509 data_in = kmap(pages_in[page_in_index]);
510 workspace->inf_strm.next_in = data_in;
511 tmp = srclen - workspace->inf_strm.total_in;
512 workspace->inf_strm.avail_in = min(tmp,
513 PAGE_CACHE_SIZE);
514 }
515 }
516 if (ret != Z_STREAM_END)
517 ret = -1;
518 else
519 ret = 0;
520done:
521 zlib_inflateEnd(&workspace->inf_strm);
522 if (data_in)
523 kunmap(pages_in[page_in_index]);
524out:
525 free_workspace(workspace);
526 return ret;
527}
528
529/*
530 * a less complex decompression routine. Our compressed data fits in a
531 * single page, and we want to read a single page out of it.
532 * start_byte tells us the offset into the compressed data we're interested in
533 */
534int btrfs_zlib_decompress(unsigned char *data_in,
535 struct page *dest_page,
536 unsigned long start_byte,
537 size_t srclen, size_t destlen)
538{
539 int ret = 0;
540 int wbits = MAX_WBITS;
541 struct workspace *workspace;
542 unsigned long bytes_left = destlen;
543 unsigned long total_out = 0;
544 char *kaddr;
545
546 if (destlen > PAGE_CACHE_SIZE)
547 return -ENOMEM;
548
549 workspace = find_zlib_workspace();
550 if (!workspace)
551 return -ENOMEM;
552
553 workspace->inf_strm.next_in = data_in;
554 workspace->inf_strm.avail_in = srclen;
555 workspace->inf_strm.total_in = 0;
556
557 workspace->inf_strm.next_out = workspace->buf;
558 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
559 workspace->inf_strm.total_out = 0;
560 /* If it's deflate, and it's got no preset dictionary, then
561 we can tell zlib to skip the adler32 check. */
562 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
563 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
564 !(((data_in[0]<<8) + data_in[1]) % 31)) {
565
566 wbits = -((data_in[0] >> 4) + 8);
567 workspace->inf_strm.next_in += 2;
568 workspace->inf_strm.avail_in -= 2;
569 }
570
571 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
572 printk(KERN_WARNING "inflateInit failed\n");
573 ret = -1;
574 goto out;
575 }
576
577 while (bytes_left > 0) {
578 unsigned long buf_start;
579 unsigned long buf_offset;
580 unsigned long bytes;
581 unsigned long pg_offset = 0;
582
583 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
584 if (ret != Z_OK && ret != Z_STREAM_END)
585 break;
586
587 buf_start = total_out;
588 total_out = workspace->inf_strm.total_out;
589
590 if (total_out == buf_start) {
591 ret = -1;
592 break;
593 }
594
595 if (total_out <= start_byte)
596 goto next;
597
598 if (total_out > start_byte && buf_start < start_byte)
599 buf_offset = start_byte - buf_start;
600 else
601 buf_offset = 0;
602
603 bytes = min(PAGE_CACHE_SIZE - pg_offset,
604 PAGE_CACHE_SIZE - buf_offset);
605 bytes = min(bytes, bytes_left);
606
607 kaddr = kmap_atomic(dest_page, KM_USER0);
608 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
609 kunmap_atomic(kaddr, KM_USER0);
610
611 pg_offset += bytes;
612 bytes_left -= bytes;
613next:
614 workspace->inf_strm.next_out = workspace->buf;
615 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
616 }
617
618 if (ret != Z_STREAM_END && bytes_left != 0)
619 ret = -1;
620 else
621 ret = 0;
622
623 zlib_inflateEnd(&workspace->inf_strm);
624out:
625 free_workspace(workspace);
626 return ret;
627}
628
/* Tear down the zlib support: release every idle workspace. */
void btrfs_zlib_exit(void)
{
	free_workspaces();

	return;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 776ae091d3b0..b58208f1640a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -203,10 +203,25 @@ int fsync_bdev(struct block_device *bdev)
203 * happen on bdev until thaw_bdev() is called. 203 * happen on bdev until thaw_bdev() is called.
204 * If a superblock is found on this device, we take the s_umount semaphore 204 * If a superblock is found on this device, we take the s_umount semaphore
205 * on it to make sure nobody unmounts until the snapshot creation is done. 205 * on it to make sure nobody unmounts until the snapshot creation is done.
206 * The reference counter (bd_fsfreeze_count) guarantees that only the last
207 * unfreeze process can unfreeze the frozen filesystem actually when multiple
208 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
209 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
210 * actually.
206 */ 211 */
207struct super_block *freeze_bdev(struct block_device *bdev) 212struct super_block *freeze_bdev(struct block_device *bdev)
208{ 213{
209 struct super_block *sb; 214 struct super_block *sb;
215 int error = 0;
216
217 mutex_lock(&bdev->bd_fsfreeze_mutex);
218 if (bdev->bd_fsfreeze_count > 0) {
219 bdev->bd_fsfreeze_count++;
220 sb = get_super(bdev);
221 mutex_unlock(&bdev->bd_fsfreeze_mutex);
222 return sb;
223 }
224 bdev->bd_fsfreeze_count++;
210 225
211 down(&bdev->bd_mount_sem); 226 down(&bdev->bd_mount_sem);
212 sb = get_super(bdev); 227 sb = get_super(bdev);
@@ -221,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev)
221 236
222 sync_blockdev(sb->s_bdev); 237 sync_blockdev(sb->s_bdev);
223 238
224 if (sb->s_op->write_super_lockfs) 239 if (sb->s_op->freeze_fs) {
225 sb->s_op->write_super_lockfs(sb); 240 error = sb->s_op->freeze_fs(sb);
241 if (error) {
242 printk(KERN_ERR
243 "VFS:Filesystem freeze failed\n");
244 sb->s_frozen = SB_UNFROZEN;
245 drop_super(sb);
246 up(&bdev->bd_mount_sem);
247 bdev->bd_fsfreeze_count--;
248 mutex_unlock(&bdev->bd_fsfreeze_mutex);
249 return ERR_PTR(error);
250 }
251 }
226 } 252 }
227 253
228 sync_blockdev(bdev); 254 sync_blockdev(bdev);
255 mutex_unlock(&bdev->bd_fsfreeze_mutex);
256
229 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ 257 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
230} 258}
231EXPORT_SYMBOL(freeze_bdev); 259EXPORT_SYMBOL(freeze_bdev);
@@ -237,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev);
237 * 265 *
238 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 266 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
239 */ 267 */
240void thaw_bdev(struct block_device *bdev, struct super_block *sb) 268int thaw_bdev(struct block_device *bdev, struct super_block *sb)
241{ 269{
270 int error = 0;
271
272 mutex_lock(&bdev->bd_fsfreeze_mutex);
273 if (!bdev->bd_fsfreeze_count) {
274 mutex_unlock(&bdev->bd_fsfreeze_mutex);
275 return -EINVAL;
276 }
277
278 bdev->bd_fsfreeze_count--;
279 if (bdev->bd_fsfreeze_count > 0) {
280 if (sb)
281 drop_super(sb);
282 mutex_unlock(&bdev->bd_fsfreeze_mutex);
283 return 0;
284 }
285
242 if (sb) { 286 if (sb) {
243 BUG_ON(sb->s_bdev != bdev); 287 BUG_ON(sb->s_bdev != bdev);
244 288 if (!(sb->s_flags & MS_RDONLY)) {
245 if (sb->s_op->unlockfs) 289 if (sb->s_op->unfreeze_fs) {
246 sb->s_op->unlockfs(sb); 290 error = sb->s_op->unfreeze_fs(sb);
247 sb->s_frozen = SB_UNFROZEN; 291 if (error) {
248 smp_wmb(); 292 printk(KERN_ERR
249 wake_up(&sb->s_wait_unfrozen); 293 "VFS:Filesystem thaw failed\n");
294 sb->s_frozen = SB_FREEZE_TRANS;
295 bdev->bd_fsfreeze_count++;
296 mutex_unlock(&bdev->bd_fsfreeze_mutex);
297 return error;
298 }
299 }
300 sb->s_frozen = SB_UNFROZEN;
301 smp_wmb();
302 wake_up(&sb->s_wait_unfrozen);
303 }
250 drop_super(sb); 304 drop_super(sb);
251 } 305 }
252 306
253 up(&bdev->bd_mount_sem); 307 up(&bdev->bd_mount_sem);
308 mutex_unlock(&bdev->bd_fsfreeze_mutex);
309 return 0;
254} 310}
255EXPORT_SYMBOL(thaw_bdev); 311EXPORT_SYMBOL(thaw_bdev);
256 312
@@ -1996,7 +2052,7 @@ int block_write_begin(struct file *file, struct address_space *mapping,
1996 page = *pagep; 2052 page = *pagep;
1997 if (page == NULL) { 2053 if (page == NULL) {
1998 ownpage = 1; 2054 ownpage = 1;
1999 page = __grab_cache_page(mapping, index); 2055 page = grab_cache_page_write_begin(mapping, index, flags);
2000 if (!page) { 2056 if (!page) {
2001 status = -ENOMEM; 2057 status = -ENOMEM;
2002 goto out; 2058 goto out;
@@ -2022,7 +2078,6 @@ int block_write_begin(struct file *file, struct address_space *mapping,
2022 if (pos + len > inode->i_size) 2078 if (pos + len > inode->i_size)
2023 vmtruncate(inode, inode->i_size); 2079 vmtruncate(inode, inode->i_size);
2024 } 2080 }
2025 goto out;
2026 } 2081 }
2027 2082
2028out: 2083out:
@@ -2502,7 +2557,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2502 from = pos & (PAGE_CACHE_SIZE - 1); 2557 from = pos & (PAGE_CACHE_SIZE - 1);
2503 to = from + len; 2558 to = from + len;
2504 2559
2505 page = __grab_cache_page(mapping, index); 2560 page = grab_cache_page_write_begin(mapping, index, flags);
2506 if (!page) 2561 if (!page)
2507 return -ENOMEM; 2562 return -ENOMEM;
2508 *pagep = page; 2563 *pagep = page;
@@ -3188,7 +3243,7 @@ void block_sync_page(struct page *page)
3188 * Use of bdflush() is deprecated and will be removed in a future kernel. 3243 * Use of bdflush() is deprecated and will be removed in a future kernel.
3189 * The `pdflush' kernel threads fully replace bdflush daemons and this call. 3244 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3190 */ 3245 */
3191asmlinkage long sys_bdflush(int func, long data) 3246SYSCALL_DEFINE2(bdflush, int, func, long, data)
3192{ 3247{
3193 static int msg_count; 3248 static int msg_count;
3194 3249
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 700697a72618..38f71222a552 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
120 cd->major = major; 120 cd->major = major;
121 cd->baseminor = baseminor; 121 cd->baseminor = baseminor;
122 cd->minorct = minorct; 122 cd->minorct = minorct;
123 strncpy(cd->name,name, 64); 123 strlcpy(cd->name, name, sizeof(cd->name));
124 124
125 i = major_to_index(major); 125 i = major_to_index(major);
126 126
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 080703a15f44..73ac7ebd1dfc 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -5,7 +5,9 @@ rather than posix (advisory) byte range locks, even though server would
5support posix byte range locks. Fix query of root inode when prefixpath 5support posix byte range locks. Fix query of root inode when prefixpath
6specified and user does not have access to query information about the 6specified and user does not have access to query information about the
7top of the share. Fix problem in 2.6.28 resolving DFS paths to 7top of the share. Fix problem in 2.6.28 resolving DFS paths to
8Samba servers (worked to Windows). 8Samba servers (worked to Windows). Fix rmdir so that pending search
9(readdir) requests do not get invalid results which include the now
10removed directory.
9 11
10Version 1.55 12Version 1.55
11------------ 13------------
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 6ba43fb346fb..9948c0030e86 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
5 5
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \ 8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o cifsacl.o 9 readdir.o ioctl.o sess.o export.o cifsacl.o
10 10
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d4839cf0cb2c..7c9809523f42 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -48,11 +48,11 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
48 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) 48 if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
49 return -EINVAL; 49 return -EINVAL;
50 50
51 MD5Init(&context); 51 cifs_MD5_init(&context);
52 MD5Update(&context, (char *)&key->data, key->len); 52 cifs_MD5_update(&context, (char *)&key->data, key->len);
53 MD5Update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); 53 cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
54 54
55 MD5Final(signature, &context); 55 cifs_MD5_final(signature, &context);
56 return 0; 56 return 0;
57} 57}
58 58
@@ -96,8 +96,8 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
96 if ((iov == NULL) || (signature == NULL) || (key == NULL)) 96 if ((iov == NULL) || (signature == NULL) || (key == NULL))
97 return -EINVAL; 97 return -EINVAL;
98 98
99 MD5Init(&context); 99 cifs_MD5_init(&context);
100 MD5Update(&context, (char *)&key->data, key->len); 100 cifs_MD5_update(&context, (char *)&key->data, key->len);
101 for (i = 0; i < n_vec; i++) { 101 for (i = 0; i < n_vec; i++) {
102 if (iov[i].iov_len == 0) 102 if (iov[i].iov_len == 0)
103 continue; 103 continue;
@@ -110,13 +110,13 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
110 if (i == 0) { 110 if (i == 0) {
111 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ 111 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
112 break; /* nothing to sign or corrupt header */ 112 break; /* nothing to sign or corrupt header */
113 MD5Update(&context, iov[0].iov_base+4, 113 cifs_MD5_update(&context, iov[0].iov_base+4,
114 iov[0].iov_len-4); 114 iov[0].iov_len-4);
115 } else 115 } else
116 MD5Update(&context, iov[i].iov_base, iov[i].iov_len); 116 cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
117 } 117 }
118 118
119 MD5Final(signature, &context); 119 cifs_MD5_final(signature, &context);
120 120
121 return 0; 121 return 0;
122} 122}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0005a194a75c..13ea53251dcf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = {
747#endif /* CONFIG_CIFS_POSIX */ 747#endif /* CONFIG_CIFS_POSIX */
748 748
749#ifdef CONFIG_CIFS_EXPERIMENTAL 749#ifdef CONFIG_CIFS_EXPERIMENTAL
750 .dir_notify = cifs_dir_notify,
751 .setlease = cifs_setlease, 750 .setlease = cifs_setlease,
752#endif /* CONFIG_CIFS_EXPERIMENTAL */ 751#endif /* CONFIG_CIFS_EXPERIMENTAL */
753}; 752};
@@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = {
768#endif /* CONFIG_CIFS_POSIX */ 767#endif /* CONFIG_CIFS_POSIX */
769 .llseek = cifs_llseek, 768 .llseek = cifs_llseek,
770#ifdef CONFIG_CIFS_EXPERIMENTAL 769#ifdef CONFIG_CIFS_EXPERIMENTAL
771 .dir_notify = cifs_dir_notify,
772 .setlease = cifs_setlease, 770 .setlease = cifs_setlease,
773#endif /* CONFIG_CIFS_EXPERIMENTAL */ 771#endif /* CONFIG_CIFS_EXPERIMENTAL */
774}; 772};
@@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = {
789#endif /* CONFIG_CIFS_POSIX */ 787#endif /* CONFIG_CIFS_POSIX */
790 788
791#ifdef CONFIG_CIFS_EXPERIMENTAL 789#ifdef CONFIG_CIFS_EXPERIMENTAL
792 .dir_notify = cifs_dir_notify,
793 .setlease = cifs_setlease, 790 .setlease = cifs_setlease,
794#endif /* CONFIG_CIFS_EXPERIMENTAL */ 791#endif /* CONFIG_CIFS_EXPERIMENTAL */
795}; 792};
@@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
809#endif /* CONFIG_CIFS_POSIX */ 806#endif /* CONFIG_CIFS_POSIX */
810 .llseek = cifs_llseek, 807 .llseek = cifs_llseek,
811#ifdef CONFIG_CIFS_EXPERIMENTAL 808#ifdef CONFIG_CIFS_EXPERIMENTAL
812 .dir_notify = cifs_dir_notify,
813 .setlease = cifs_setlease, 809 .setlease = cifs_setlease,
814#endif /* CONFIG_CIFS_EXPERIMENTAL */ 810#endif /* CONFIG_CIFS_EXPERIMENTAL */
815}; 811};
@@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = {
818 .readdir = cifs_readdir, 814 .readdir = cifs_readdir,
819 .release = cifs_closedir, 815 .release = cifs_closedir,
820 .read = generic_read_dir, 816 .read = generic_read_dir,
821#ifdef CONFIG_CIFS_EXPERIMENTAL
822 .dir_notify = cifs_dir_notify,
823#endif /* CONFIG_CIFS_EXPERIMENTAL */
824 .unlocked_ioctl = cifs_ioctl, 817 .unlocked_ioctl = cifs_ioctl,
825 .llseek = generic_file_llseek, 818 .llseek = generic_file_llseek,
826}; 819};
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2ce04c73d74e..7ac481841f87 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
76extern const struct file_operations cifs_dir_ops; 76extern const struct file_operations cifs_dir_ops;
77extern int cifs_dir_open(struct inode *inode, struct file *file); 77extern int cifs_dir_open(struct inode *inode, struct file *file);
78extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 78extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
79extern int cifs_dir_notify(struct file *, unsigned long arg);
80 79
81/* Functions related to dir entries */ 80/* Functions related to dir entries */
82extern struct dentry_operations cifs_dentry_ops; 81extern struct dentry_operations cifs_dentry_ops;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 06f6779988bf..382ba6298809 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -35,8 +35,8 @@ extern struct smb_hdr *cifs_buf_get(void);
35extern void cifs_buf_release(void *); 35extern void cifs_buf_release(void *);
36extern struct smb_hdr *cifs_small_buf_get(void); 36extern struct smb_hdr *cifs_small_buf_get(void);
37extern void cifs_small_buf_release(void *); 37extern void cifs_small_buf_release(void *);
38extern int smb_send(struct socket *, struct smb_hdr *, 38extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
39 unsigned int /* length */ , struct sockaddr *, bool); 39 unsigned int /* length */);
40extern unsigned int _GetXid(void); 40extern unsigned int _GetXid(void);
41extern void _FreeXid(unsigned int); 41extern void _FreeXid(unsigned int);
42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); 42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e9ea394ee075..2209be943051 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1354,7 +1354,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1354} 1354}
1355 1355
1356static struct TCP_Server_Info * 1356static struct TCP_Server_Info *
1357cifs_find_tcp_session(struct sockaddr *addr) 1357cifs_find_tcp_session(struct sockaddr_storage *addr)
1358{ 1358{
1359 struct list_head *tmp; 1359 struct list_head *tmp;
1360 struct TCP_Server_Info *server; 1360 struct TCP_Server_Info *server;
@@ -1374,11 +1374,11 @@ cifs_find_tcp_session(struct sockaddr *addr)
1374 if (server->tcpStatus == CifsNew) 1374 if (server->tcpStatus == CifsNew)
1375 continue; 1375 continue;
1376 1376
1377 if (addr->sa_family == AF_INET && 1377 if (addr->ss_family == AF_INET &&
1378 (addr4->sin_addr.s_addr != 1378 (addr4->sin_addr.s_addr !=
1379 server->addr.sockAddr.sin_addr.s_addr)) 1379 server->addr.sockAddr.sin_addr.s_addr))
1380 continue; 1380 continue;
1381 else if (addr->sa_family == AF_INET6 && 1381 else if (addr->ss_family == AF_INET6 &&
1382 memcmp(&server->addr.sockAddr6.sin6_addr, 1382 memcmp(&server->addr.sockAddr6.sin6_addr,
1383 &addr6->sin6_addr, sizeof(addr6->sin6_addr))) 1383 &addr6->sin6_addr, sizeof(addr6->sin6_addr)))
1384 continue; 1384 continue;
@@ -1419,12 +1419,12 @@ static struct TCP_Server_Info *
1419cifs_get_tcp_session(struct smb_vol *volume_info) 1419cifs_get_tcp_session(struct smb_vol *volume_info)
1420{ 1420{
1421 struct TCP_Server_Info *tcp_ses = NULL; 1421 struct TCP_Server_Info *tcp_ses = NULL;
1422 struct sockaddr addr; 1422 struct sockaddr_storage addr;
1423 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr; 1423 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
1424 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr; 1424 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
1425 int rc; 1425 int rc;
1426 1426
1427 memset(&addr, 0, sizeof(struct sockaddr)); 1427 memset(&addr, 0, sizeof(struct sockaddr_storage));
1428 1428
1429 if (volume_info->UNCip && volume_info->UNC) { 1429 if (volume_info->UNCip && volume_info->UNC) {
1430 rc = cifs_inet_pton(AF_INET, volume_info->UNCip, 1430 rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
@@ -1435,9 +1435,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1435 rc = cifs_inet_pton(AF_INET6, volume_info->UNCip, 1435 rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
1436 &sin_server6->sin6_addr.in6_u); 1436 &sin_server6->sin6_addr.in6_u);
1437 if (rc > 0) 1437 if (rc > 0)
1438 addr.sa_family = AF_INET6; 1438 addr.ss_family = AF_INET6;
1439 } else { 1439 } else {
1440 addr.sa_family = AF_INET; 1440 addr.ss_family = AF_INET;
1441 } 1441 }
1442 1442
1443 if (rc <= 0) { 1443 if (rc <= 0) {
@@ -1502,7 +1502,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1502 tcp_ses->tcpStatus = CifsNew; 1502 tcp_ses->tcpStatus = CifsNew;
1503 ++tcp_ses->srv_count; 1503 ++tcp_ses->srv_count;
1504 1504
1505 if (addr.sa_family == AF_INET6) { 1505 if (addr.ss_family == AF_INET6) {
1506 cFYI(1, ("attempting ipv6 connect")); 1506 cFYI(1, ("attempting ipv6 connect"));
1507 /* BB should we allow ipv6 on port 139? */ 1507 /* BB should we allow ipv6 on port 139? */
1508 /* other OS never observed in Wild doing 139 with v6 */ 1508 /* other OS never observed in Wild doing 139 with v6 */
@@ -1802,7 +1802,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1802 * user space buffer 1802 * user space buffer
1803 */ 1803 */
1804 socket->sk->sk_rcvtimeo = 7 * HZ; 1804 socket->sk->sk_rcvtimeo = 7 * HZ;
1805 socket->sk->sk_sndtimeo = 3 * HZ; 1805 socket->sk->sk_sndtimeo = 5 * HZ;
1806 1806
1807 /* make the bufsizes depend on wsize/rsize and max requests */ 1807 /* make the bufsizes depend on wsize/rsize and max requests */
1808 if (server->noautotune) { 1808 if (server->noautotune) {
@@ -1860,9 +1860,7 @@ ipv4_connect(struct TCP_Server_Info *server)
1860 smb_buf = (struct smb_hdr *)ses_init_buf; 1860 smb_buf = (struct smb_hdr *)ses_init_buf;
1861 /* sizeof RFC1002_SESSION_REQUEST with no scope */ 1861 /* sizeof RFC1002_SESSION_REQUEST with no scope */
1862 smb_buf->smb_buf_length = 0x81000044; 1862 smb_buf->smb_buf_length = 0x81000044;
1863 rc = smb_send(socket, smb_buf, 0x44, 1863 rc = smb_send(server, smb_buf, 0x44);
1864 (struct sockaddr *) &server->addr.sockAddr,
1865 server->noblocksnd);
1866 kfree(ses_init_buf); 1864 kfree(ses_init_buf);
1867 msleep(1); /* RFC1001 layer in at least one server 1865 msleep(1); /* RFC1001 layer in at least one server
1868 requires very short break before negprot 1866 requires very short break before negprot
@@ -1955,7 +1953,7 @@ ipv6_connect(struct TCP_Server_Info *server)
1955 * user space buffer 1953 * user space buffer
1956 */ 1954 */
1957 socket->sk->sk_rcvtimeo = 7 * HZ; 1955 socket->sk->sk_rcvtimeo = 7 * HZ;
1958 socket->sk->sk_sndtimeo = 3 * HZ; 1956 socket->sk->sk_sndtimeo = 5 * HZ;
1959 server->ssocket = socket; 1957 server->ssocket = socket;
1960 1958
1961 return rc; 1959 return rc;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 838d9c720a5c..964aad03c5ad 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,6 +129,17 @@ cifs_bp_rename_retry:
129 return full_path; 129 return full_path;
130} 130}
131 131
132static void setup_cifs_dentry(struct cifsTconInfo *tcon,
133 struct dentry *direntry,
134 struct inode *newinode)
135{
136 if (tcon->nocase)
137 direntry->d_op = &cifs_ci_dentry_ops;
138 else
139 direntry->d_op = &cifs_dentry_ops;
140 d_instantiate(direntry, newinode);
141}
142
132/* Inode operations in similar order to how they appear in Linux file fs.h */ 143/* Inode operations in similar order to how they appear in Linux file fs.h */
133 144
134int 145int
@@ -139,14 +150,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
139 int xid; 150 int xid;
140 int create_options = CREATE_NOT_DIR; 151 int create_options = CREATE_NOT_DIR;
141 int oplock = 0; 152 int oplock = 0;
153 /* BB below access is too much for the mknod to request */
142 int desiredAccess = GENERIC_READ | GENERIC_WRITE; 154 int desiredAccess = GENERIC_READ | GENERIC_WRITE;
143 __u16 fileHandle; 155 __u16 fileHandle;
144 struct cifs_sb_info *cifs_sb; 156 struct cifs_sb_info *cifs_sb;
145 struct cifsTconInfo *pTcon; 157 struct cifsTconInfo *tcon;
146 char *full_path = NULL; 158 char *full_path = NULL;
147 FILE_ALL_INFO *buf = NULL; 159 FILE_ALL_INFO *buf = NULL;
148 struct inode *newinode = NULL; 160 struct inode *newinode = NULL;
149 struct cifsFileInfo *pCifsFile = NULL;
150 struct cifsInodeInfo *pCifsInode; 161 struct cifsInodeInfo *pCifsInode;
151 int disposition = FILE_OVERWRITE_IF; 162 int disposition = FILE_OVERWRITE_IF;
152 bool write_only = false; 163 bool write_only = false;
@@ -154,7 +165,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
154 xid = GetXid(); 165 xid = GetXid();
155 166
156 cifs_sb = CIFS_SB(inode->i_sb); 167 cifs_sb = CIFS_SB(inode->i_sb);
157 pTcon = cifs_sb->tcon; 168 tcon = cifs_sb->tcon;
158 169
159 full_path = build_path_from_dentry(direntry); 170 full_path = build_path_from_dentry(direntry);
160 if (full_path == NULL) { 171 if (full_path == NULL) {
@@ -162,6 +173,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
162 return -ENOMEM; 173 return -ENOMEM;
163 } 174 }
164 175
176 mode &= ~current->fs->umask;
177
165 if (nd && (nd->flags & LOOKUP_OPEN)) { 178 if (nd && (nd->flags & LOOKUP_OPEN)) {
166 int oflags = nd->intent.open.flags; 179 int oflags = nd->intent.open.flags;
167 180
@@ -196,17 +209,15 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
196 return -ENOMEM; 209 return -ENOMEM;
197 } 210 }
198 211
199 mode &= ~current->fs->umask;
200
201 /* 212 /*
202 * if we're not using unix extensions, see if we need to set 213 * if we're not using unix extensions, see if we need to set
203 * ATTR_READONLY on the create call 214 * ATTR_READONLY on the create call
204 */ 215 */
205 if (!pTcon->unix_ext && (mode & S_IWUGO) == 0) 216 if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
206 create_options |= CREATE_OPTION_READONLY; 217 create_options |= CREATE_OPTION_READONLY;
207 218
208 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 219 if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
209 rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, 220 rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
210 desiredAccess, create_options, 221 desiredAccess, create_options,
211 &fileHandle, &oplock, buf, cifs_sb->local_nls, 222 &fileHandle, &oplock, buf, cifs_sb->local_nls,
212 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 223 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -215,7 +226,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
215 226
216 if (rc == -EIO) { 227 if (rc == -EIO) {
217 /* old server, retry the open legacy style */ 228 /* old server, retry the open legacy style */
218 rc = SMBLegacyOpen(xid, pTcon, full_path, disposition, 229 rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
219 desiredAccess, create_options, 230 desiredAccess, create_options,
220 &fileHandle, &oplock, buf, cifs_sb->local_nls, 231 &fileHandle, &oplock, buf, cifs_sb->local_nls,
221 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 232 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -225,7 +236,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
225 } else { 236 } else {
226 /* If Open reported that we actually created a file 237 /* If Open reported that we actually created a file
227 then we now have to set the mode if possible */ 238 then we now have to set the mode if possible */
228 if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { 239 if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
229 struct cifs_unix_set_info_args args = { 240 struct cifs_unix_set_info_args args = {
230 .mode = mode, 241 .mode = mode,
231 .ctime = NO_CHANGE_64, 242 .ctime = NO_CHANGE_64,
@@ -244,20 +255,20 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
244 args.uid = NO_CHANGE_64; 255 args.uid = NO_CHANGE_64;
245 args.gid = NO_CHANGE_64; 256 args.gid = NO_CHANGE_64;
246 } 257 }
247 CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, 258 CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
248 cifs_sb->local_nls, 259 cifs_sb->local_nls,
249 cifs_sb->mnt_cifs_flags & 260 cifs_sb->mnt_cifs_flags &
250 CIFS_MOUNT_MAP_SPECIAL_CHR); 261 CIFS_MOUNT_MAP_SPECIAL_CHR);
251 } else { 262 } else {
252 /* BB implement mode setting via Windows security 263 /* BB implement mode setting via Windows security
253 descriptors e.g. */ 264 descriptors e.g. */
254 /* CIFSSMBWinSetPerms(xid,pTcon,path,mode,-1,-1,nls);*/ 265 /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
255 266
256 /* Could set r/o dos attribute if mode & 0222 == 0 */ 267 /* Could set r/o dos attribute if mode & 0222 == 0 */
257 } 268 }
258 269
259 /* server might mask mode so we have to query for it */ 270 /* server might mask mode so we have to query for it */
260 if (pTcon->unix_ext) 271 if (tcon->unix_ext)
261 rc = cifs_get_inode_info_unix(&newinode, full_path, 272 rc = cifs_get_inode_info_unix(&newinode, full_path,
262 inode->i_sb, xid); 273 inode->i_sb, xid);
263 else { 274 else {
@@ -283,22 +294,17 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
283 } 294 }
284 295
285 if (rc != 0) { 296 if (rc != 0) {
286 cFYI(1, 297 cFYI(1, ("Create worked, get_inode_info failed rc = %d",
287 ("Create worked but get_inode_info failed rc = %d", 298 rc));
288 rc)); 299 } else
289 } else { 300 setup_cifs_dentry(tcon, direntry, newinode);
290 if (pTcon->nocase) 301
291 direntry->d_op = &cifs_ci_dentry_ops;
292 else
293 direntry->d_op = &cifs_dentry_ops;
294 d_instantiate(direntry, newinode);
295 }
296 if ((nd == NULL /* nfsd case - nfs srv does not set nd */) || 302 if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
297 (!(nd->flags & LOOKUP_OPEN))) { 303 (!(nd->flags & LOOKUP_OPEN))) {
298 /* mknod case - do not leave file open */ 304 /* mknod case - do not leave file open */
299 CIFSSMBClose(xid, pTcon, fileHandle); 305 CIFSSMBClose(xid, tcon, fileHandle);
300 } else if (newinode) { 306 } else if (newinode) {
301 pCifsFile = 307 struct cifsFileInfo *pCifsFile =
302 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 308 kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
303 309
304 if (pCifsFile == NULL) 310 if (pCifsFile == NULL)
@@ -316,7 +322,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
316 /* set the following in open now 322 /* set the following in open now
317 pCifsFile->pfile = file; */ 323 pCifsFile->pfile = file; */
318 write_lock(&GlobalSMBSeslock); 324 write_lock(&GlobalSMBSeslock);
319 list_add(&pCifsFile->tlist, &pTcon->openFileList); 325 list_add(&pCifsFile->tlist, &tcon->openFileList);
320 pCifsInode = CIFS_I(newinode); 326 pCifsInode = CIFS_I(newinode);
321 if (pCifsInode) { 327 if (pCifsInode) {
322 /* if readable file instance put first in list*/ 328 /* if readable file instance put first in list*/
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
deleted file mode 100644
index 5a57581eb4b2..000000000000
--- a/fs/cifs/fcntl.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * fs/cifs/fcntl.c
3 *
4 * vfs operations that deal with the file control API
5 *
6 * Copyright (C) International Business Machines Corp., 2003,2004
7 * Author(s): Steve French (sfrench@us.ibm.com)
8 *
9 * This library is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as published
11 * by the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17 * the GNU Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this library; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23#include <linux/fs.h>
24#include <linux/stat.h>
25#include <linux/fcntl.h>
26#include "cifsglob.h"
27#include "cifsproto.h"
28#include "cifs_unicode.h"
29#include "cifs_debug.h"
30#include "cifsfs.h"
31
32static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
33{
34 __u32 cifs_ntfy_flags = 0;
35
36 /* No way on Linux VFS to ask to monitor xattr
37 changes (and no stream support either */
38 if (fcntl_notify_flags & DN_ACCESS)
39 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
40 if (fcntl_notify_flags & DN_MODIFY) {
41 /* What does this mean on directories? */
42 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
43 FILE_NOTIFY_CHANGE_SIZE;
44 }
45 if (fcntl_notify_flags & DN_CREATE) {
46 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
47 FILE_NOTIFY_CHANGE_LAST_WRITE;
48 }
49 if (fcntl_notify_flags & DN_DELETE)
50 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
51 if (fcntl_notify_flags & DN_RENAME) {
52 /* BB review this - checking various server behaviors */
53 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
54 FILE_NOTIFY_CHANGE_FILE_NAME;
55 }
56 if (fcntl_notify_flags & DN_ATTRIB) {
57 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY |
58 FILE_NOTIFY_CHANGE_ATTRIBUTES;
59 }
60/* if (fcntl_notify_flags & DN_MULTISHOT) {
61 cifs_ntfy_flags |= ;
62 } */ /* BB fixme - not sure how to handle this with CIFS yet */
63
64 return cifs_ntfy_flags;
65}
66
67int cifs_dir_notify(struct file *file, unsigned long arg)
68{
69 int xid;
70 int rc = -EINVAL;
71 int oplock = 0;
72 struct cifs_sb_info *cifs_sb;
73 struct cifsTconInfo *pTcon;
74 char *full_path = NULL;
75 __u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES;
76 __u16 netfid;
77
78 if (experimEnabled == 0)
79 return 0;
80
81 xid = GetXid();
82 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
83 pTcon = cifs_sb->tcon;
84
85 full_path = build_path_from_dentry(file->f_path.dentry);
86
87 if (full_path == NULL) {
88 rc = -ENOMEM;
89 } else {
90 cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg));
91 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
92 GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
93 &netfid, &oplock, NULL, cifs_sb->local_nls,
94 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
95 /* BB fixme - add this handle to a notify handle list */
96 if (rc) {
97 cFYI(1, ("Could not open directory for notify"));
98 } else {
99 filter = convert_to_cifs_notify_flags(arg);
100 if (filter != 0) {
101 rc = CIFSSMBNotify(xid, pTcon,
102 0 /* no subdirs */, netfid,
103 filter, file, arg & DN_MULTISHOT,
104 cifs_sb->local_nls);
105 } else {
106 rc = -EINVAL;
107 }
108 /* BB add code to close file eventually (at unmount
109 it would close automatically but may be a way
110 to do it easily when inode freed or when
111 notify info is cleared/changed */
112 cFYI(1, ("notify rc %d", rc));
113 }
114 }
115
116 FreeXid(xid);
117 return rc;
118}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b1e1fc6a6e6a..12bb656fbe75 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
2074 2074
2075 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); 2075 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
2076 2076
2077 page = __grab_cache_page(mapping, index); 2077 page = grab_cache_page_write_begin(mapping, index, flags);
2078 if (!page) { 2078 if (!page) {
2079 rc = -ENOMEM; 2079 rc = -ENOMEM;
2080 goto out; 2080 goto out;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4edc..bcf7b5184664 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1285,6 +1285,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
1285 cifsInode = CIFS_I(direntry->d_inode); 1285 cifsInode = CIFS_I(direntry->d_inode);
1286 cifsInode->time = 0; /* force revalidate to go get info when 1286 cifsInode->time = 0; /* force revalidate to go get info when
1287 needed */ 1287 needed */
1288
1289 cifsInode = CIFS_I(inode);
1290 cifsInode->time = 0; /* force revalidate to get parent dir info
1291 since cached search results now invalid */
1292
1288 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = 1293 direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
1289 current_fs_time(inode->i_sb); 1294 current_fs_time(inode->i_sb);
1290 1295
@@ -1641,7 +1646,7 @@ do_expand:
1641 i_size_write(inode, offset); 1646 i_size_write(inode, offset);
1642 spin_unlock(&inode->i_lock); 1647 spin_unlock(&inode->i_lock);
1643out_truncate: 1648out_truncate:
1644 if (inode->i_op && inode->i_op->truncate) 1649 if (inode->i_op->truncate)
1645 inode->i_op->truncate(inode); 1650 inode->i_op->truncate(inode);
1646 return 0; 1651 return 0;
1647out_sig: 1652out_sig:
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 462bbfefd4b6..98b66a54c319 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -10,8 +10,8 @@
10 * with every copy. 10 * with every copy.
11 * 11 *
12 * To compute the message digest of a chunk of bytes, declare an 12 * To compute the message digest of a chunk of bytes, declare an
13 * MD5Context structure, pass it to MD5Init, call MD5Update as 13 * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
14 * needed on buffers full of bytes, and then call MD5Final, which 14 * needed on buffers full of bytes, and then call cifs_MD5_final, which
15 * will fill a supplied 16-byte array with the digest. 15 * will fill a supplied 16-byte array with the digest.
16 */ 16 */
17 17
@@ -45,7 +45,7 @@ byteReverse(unsigned char *buf, unsigned longs)
45 * initialization constants. 45 * initialization constants.
46 */ 46 */
47void 47void
48MD5Init(struct MD5Context *ctx) 48cifs_MD5_init(struct MD5Context *ctx)
49{ 49{
50 ctx->buf[0] = 0x67452301; 50 ctx->buf[0] = 0x67452301;
51 ctx->buf[1] = 0xefcdab89; 51 ctx->buf[1] = 0xefcdab89;
@@ -61,7 +61,7 @@ MD5Init(struct MD5Context *ctx)
61 * of bytes. 61 * of bytes.
62 */ 62 */
63void 63void
64MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len) 64cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
65{ 65{
66 register __u32 t; 66 register __u32 t;
67 67
@@ -110,7 +110,7 @@ MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
110 * 1 0* (64-bit count of bits processed, MSB-first) 110 * 1 0* (64-bit count of bits processed, MSB-first)
111 */ 111 */
112void 112void
113MD5Final(unsigned char digest[16], struct MD5Context *ctx) 113cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
114{ 114{
115 unsigned int count; 115 unsigned int count;
116 unsigned char *p; 116 unsigned char *p;
@@ -165,7 +165,7 @@ MD5Final(unsigned char digest[16], struct MD5Context *ctx)
165 165
166/* 166/*
167 * The core of the MD5 algorithm, this alters an existing MD5 hash to 167 * The core of the MD5 algorithm, this alters an existing MD5 hash to
168 * reflect the addition of 16 longwords of new data. MD5Update blocks 168 * reflect the addition of 16 longwords of new data. cifs_MD5_update blocks
169 * the data and converts bytes into longwords for this routine. 169 * the data and converts bytes into longwords for this routine.
170 */ 170 */
171static void 171static void
@@ -267,9 +267,9 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
267 unsigned char tk[16]; 267 unsigned char tk[16];
268 struct MD5Context tctx; 268 struct MD5Context tctx;
269 269
270 MD5Init(&tctx); 270 cifs_MD5_init(&tctx);
271 MD5Update(&tctx, key, key_len); 271 cifs_MD5_update(&tctx, key, key_len);
272 MD5Final(tk, &tctx); 272 cifs_MD5_final(tk, &tctx);
273 273
274 key = tk; 274 key = tk;
275 key_len = 16; 275 key_len = 16;
@@ -287,8 +287,8 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
287 ctx->k_opad[i] ^= 0x5c; 287 ctx->k_opad[i] ^= 0x5c;
288 } 288 }
289 289
290 MD5Init(&ctx->ctx); 290 cifs_MD5_init(&ctx->ctx);
291 MD5Update(&ctx->ctx, ctx->k_ipad, 64); 291 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
292} 292}
293#endif 293#endif
294 294
@@ -317,8 +317,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
317 ctx->k_opad[i] ^= 0x5c; 317 ctx->k_opad[i] ^= 0x5c;
318 } 318 }
319 319
320 MD5Init(&ctx->ctx); 320 cifs_MD5_init(&ctx->ctx);
321 MD5Update(&ctx->ctx, ctx->k_ipad, 64); 321 cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
322} 322}
323 323
324/*********************************************************************** 324/***********************************************************************
@@ -328,7 +328,7 @@ void
328hmac_md5_update(const unsigned char *text, int text_len, 328hmac_md5_update(const unsigned char *text, int text_len,
329 struct HMACMD5Context *ctx) 329 struct HMACMD5Context *ctx)
330{ 330{
331 MD5Update(&ctx->ctx, text, text_len); /* then text of datagram */ 331 cifs_MD5_update(&ctx->ctx, text, text_len); /* then text of datagram */
332} 332}
333 333
334/*********************************************************************** 334/***********************************************************************
@@ -339,12 +339,12 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
339{ 339{
340 struct MD5Context ctx_o; 340 struct MD5Context ctx_o;
341 341
342 MD5Final(digest, &ctx->ctx); 342 cifs_MD5_final(digest, &ctx->ctx);
343 343
344 MD5Init(&ctx_o); 344 cifs_MD5_init(&ctx_o);
345 MD5Update(&ctx_o, ctx->k_opad, 64); 345 cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
346 MD5Update(&ctx_o, digest, 16); 346 cifs_MD5_update(&ctx_o, digest, 16);
347 MD5Final(digest, &ctx_o); 347 cifs_MD5_final(digest, &ctx_o);
348} 348}
349 349
350/*********************************************************** 350/***********************************************************
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index f7d4f4197bac..6fba8cb402fd 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -20,10 +20,10 @@ struct HMACMD5Context {
20}; 20};
21#endif /* _HMAC_MD5_H */ 21#endif /* _HMAC_MD5_H */
22 22
23void MD5Init(struct MD5Context *context); 23void cifs_MD5_init(struct MD5Context *context);
24void MD5Update(struct MD5Context *context, unsigned char const *buf, 24void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
25 unsigned len); 25 unsigned len);
26void MD5Final(unsigned char digest[16], struct MD5Context *context); 26void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
27 27
28/* The following definitions come from lib/hmacmd5.c */ 28/* The following definitions come from lib/hmacmd5.c */
29 29
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ebe6599ed3a..0ad3e2d116a6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -154,81 +154,8 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
154 spin_unlock(&GlobalMid_Lock); 154 spin_unlock(&GlobalMid_Lock);
155} 155}
156 156
157int
158smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
159 unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
160{
161 int rc = 0;
162 int i = 0;
163 struct msghdr smb_msg;
164 struct kvec iov;
165 unsigned len = smb_buf_length + 4;
166
167 if (ssocket == NULL)
168 return -ENOTSOCK; /* BB eventually add reconnect code here */
169 iov.iov_base = smb_buffer;
170 iov.iov_len = len;
171
172 smb_msg.msg_name = sin;
173 smb_msg.msg_namelen = sizeof(struct sockaddr);
174 smb_msg.msg_control = NULL;
175 smb_msg.msg_controllen = 0;
176 if (noblocksnd)
177 smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
178 else
179 smb_msg.msg_flags = MSG_NOSIGNAL;
180
181 /* smb header is converted in header_assemble. bcc and rest of SMB word
182 area, and byte area if necessary, is converted to littleendian in
183 cifssmb.c and RFC1001 len is converted to bigendian in smb_send
184 Flags2 is converted in SendReceive */
185
186 smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
187 cFYI(1, ("Sending smb of length %d", smb_buf_length));
188 dump_smb(smb_buffer, len);
189
190 while (len > 0) {
191 rc = kernel_sendmsg(ssocket, &smb_msg, &iov, 1, len);
192 if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
193 i++;
194 /* smaller timeout here than send2 since smaller size */
195 /* Although it may not be required, this also is smaller
196 oplock break time */
197 if (i > 12) {
198 cERROR(1,
199 ("sends on sock %p stuck for 7 seconds",
200 ssocket));
201 rc = -EAGAIN;
202 break;
203 }
204 msleep(1 << i);
205 continue;
206 }
207 if (rc < 0)
208 break;
209 else
210 i = 0; /* reset i after each successful send */
211 iov.iov_base += rc;
212 iov.iov_len -= rc;
213 len -= rc;
214 }
215
216 if (rc < 0) {
217 cERROR(1, ("Error %d sending data on socket to server", rc));
218 } else {
219 rc = 0;
220 }
221
222 /* Don't want to modify the buffer as a
223 side effect of this call. */
224 smb_buffer->smb_buf_length = smb_buf_length;
225
226 return rc;
227}
228
229static int 157static int
230smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec, 158smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
231 struct sockaddr *sin, bool noblocksnd)
232{ 159{
233 int rc = 0; 160 int rc = 0;
234 int i = 0; 161 int i = 0;
@@ -243,11 +170,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
243 if (ssocket == NULL) 170 if (ssocket == NULL)
244 return -ENOTSOCK; /* BB eventually add reconnect code here */ 171 return -ENOTSOCK; /* BB eventually add reconnect code here */
245 172
246 smb_msg.msg_name = sin; 173 smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr;
247 smb_msg.msg_namelen = sizeof(struct sockaddr); 174 smb_msg.msg_namelen = sizeof(struct sockaddr);
248 smb_msg.msg_control = NULL; 175 smb_msg.msg_control = NULL;
249 smb_msg.msg_controllen = 0; 176 smb_msg.msg_controllen = 0;
250 if (noblocksnd) 177 if (server->noblocksnd)
251 smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; 178 smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
252 else 179 else
253 smb_msg.msg_flags = MSG_NOSIGNAL; 180 smb_msg.msg_flags = MSG_NOSIGNAL;
@@ -272,7 +199,25 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
272 n_vec - first_vec, total_len); 199 n_vec - first_vec, total_len);
273 if ((rc == -ENOSPC) || (rc == -EAGAIN)) { 200 if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
274 i++; 201 i++;
275 if (i >= 14) { 202 /* if blocking send we try 3 times, since each can block
203 for 5 seconds. For nonblocking we have to try more
204 but wait increasing amounts of time allowing time for
205 socket to clear. The overall time we wait in either
206 case to send on the socket is about 15 seconds.
207 Similarly we wait for 15 seconds for
208 a response from the server in SendReceive[2]
209 for the server to send a response back for
210 most types of requests (except SMB Write
211 past end of file which can be slow, and
212 blocking lock operations). NFS waits slightly longer
213 than CIFS, but this can make it take longer for
214 nonresponsive servers to be detected and 15 seconds
215 is more than enough time for modern networks to
216 send a packet. In most cases if we fail to send
217 after the retries we will kill the socket and
218 reconnect which may clear the network problem.
219 */
220 if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
276 cERROR(1, 221 cERROR(1,
277 ("sends on sock %p stuck for 15 seconds", 222 ("sends on sock %p stuck for 15 seconds",
278 ssocket)); 223 ssocket));
@@ -339,6 +284,18 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
339 return rc; 284 return rc;
340} 285}
341 286
287int
288smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
289 unsigned int smb_buf_length)
290{
291 struct kvec iov;
292
293 iov.iov_base = smb_buffer;
294 iov.iov_len = smb_buf_length + 4;
295
296 return smb_sendv(server, &iov, 1);
297}
298
342static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op) 299static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
343{ 300{
344 if (long_op == CIFS_ASYNC_OP) { 301 if (long_op == CIFS_ASYNC_OP) {
@@ -540,9 +497,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
540#ifdef CONFIG_CIFS_STATS2 497#ifdef CONFIG_CIFS_STATS2
541 atomic_inc(&ses->server->inSend); 498 atomic_inc(&ses->server->inSend);
542#endif 499#endif
543 rc = smb_send2(ses->server, iov, n_vec, 500 rc = smb_sendv(ses->server, iov, n_vec);
544 (struct sockaddr *) &(ses->server->addr.sockAddr),
545 ses->server->noblocksnd);
546#ifdef CONFIG_CIFS_STATS2 501#ifdef CONFIG_CIFS_STATS2
547 atomic_dec(&ses->server->inSend); 502 atomic_dec(&ses->server->inSend);
548 midQ->when_sent = jiffies; 503 midQ->when_sent = jiffies;
@@ -736,9 +691,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
736#ifdef CONFIG_CIFS_STATS2 691#ifdef CONFIG_CIFS_STATS2
737 atomic_inc(&ses->server->inSend); 692 atomic_inc(&ses->server->inSend);
738#endif 693#endif
739 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 694 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
740 (struct sockaddr *) &(ses->server->addr.sockAddr),
741 ses->server->noblocksnd);
742#ifdef CONFIG_CIFS_STATS2 695#ifdef CONFIG_CIFS_STATS2
743 atomic_dec(&ses->server->inSend); 696 atomic_dec(&ses->server->inSend);
744 midQ->when_sent = jiffies; 697 midQ->when_sent = jiffies;
@@ -879,9 +832,7 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
879 mutex_unlock(&ses->server->srv_mutex); 832 mutex_unlock(&ses->server->srv_mutex);
880 return rc; 833 return rc;
881 } 834 }
882 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 835 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
883 (struct sockaddr *) &(ses->server->addr.sockAddr),
884 ses->server->noblocksnd);
885 mutex_unlock(&ses->server->srv_mutex); 836 mutex_unlock(&ses->server->srv_mutex);
886 return rc; 837 return rc;
887} 838}
@@ -973,9 +924,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
973#ifdef CONFIG_CIFS_STATS2 924#ifdef CONFIG_CIFS_STATS2
974 atomic_inc(&ses->server->inSend); 925 atomic_inc(&ses->server->inSend);
975#endif 926#endif
976 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 927 rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
977 (struct sockaddr *) &(ses->server->addr.sockAddr),
978 ses->server->noblocksnd);
979#ifdef CONFIG_CIFS_STATS2 928#ifdef CONFIG_CIFS_STATS2
980 atomic_dec(&ses->server->inSend); 929 atomic_dec(&ses->server->inSend);
981 midQ->when_sent = jiffies; 930 midQ->when_sent = jiffies;
diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig
new file mode 100644
index 000000000000..c0e5a7fad06d
--- /dev/null
+++ b/fs/coda/Kconfig
@@ -0,0 +1,21 @@
1config CODA_FS
2 tristate "Coda file system support (advanced network fs)"
3 depends on INET
4 help
5 Coda is an advanced network file system, similar to NFS in that it
6 enables you to mount file systems of a remote server and access them
7 with regular Unix commands as if they were sitting on your hard
8 disk. Coda has several advantages over NFS: support for
9 disconnected operation (e.g. for laptops), read/write server
10 replication, security model for authentication and encryption,
11 persistent client caches and write back caching.
12
13 If you say Y here, your Linux box will be able to act as a Coda
14 *client*. You will need user level code as well, both for the
15 client and server. Servers are currently user level, i.e. they need
16 no kernel support. Please read
17 <file:Documentation/filesystems/coda.txt> and check out the Coda
18 home page <http://www.coda.cs.cmu.edu/>.
19
20 To compile the coda client support as a module, choose M here: the
21 module will be called coda.
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df6..6a347fbc998a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
201int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 201int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
202{ 202{
203 struct file *host_file; 203 struct file *host_file;
204 struct dentry *host_dentry; 204 struct inode *coda_inode = coda_dentry->d_inode;
205 struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
206 struct coda_file_info *cfi; 205 struct coda_file_info *cfi;
207 int err = 0; 206 int err = 0;
208 207
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
214 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 213 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
215 host_file = cfi->cfi_container; 214 host_file = cfi->cfi_container;
216 215
217 if (host_file->f_op && host_file->f_op->fsync) { 216 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
218 host_dentry = host_file->f_path.dentry;
219 host_inode = host_dentry->d_inode;
220 mutex_lock(&host_inode->i_mutex);
221 err = host_file->f_op->fsync(host_file, host_dentry, datasync);
222 mutex_unlock(&host_inode->i_mutex);
223 }
224
225 if ( !err && !datasync ) { 217 if ( !err && !datasync ) {
226 lock_kernel(); 218 lock_kernel();
227 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 219 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c6465..43c96ce29614 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
11 11
12#include "coda_int.h" 12#include "coda_int.h"
13 13
14#ifdef CONFIG_SYSCTL
14static struct ctl_table_header *fs_table_header; 15static struct ctl_table_header *fs_table_header;
16#endif
15 17
16static ctl_table coda_table[] = { 18static ctl_table coda_table[] = {
17 { 19 {
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
41 {} 43 {}
42}; 44};
43 45
46#ifdef CONFIG_SYSCTL
44static ctl_table fs_table[] = { 47static ctl_table fs_table[] = {
45 { 48 {
46 .ctl_name = CTL_UNNUMBERED, 49 .ctl_name = CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
50 }, 53 },
51 {} 54 {}
52}; 55};
53 56#endif
54 57
55void coda_sysctl_init(void) 58void coda_sysctl_init(void)
56{ 59{
diff --git a/fs/compat.c b/fs/compat.c
index d1ece79b6411..65a070e705ab 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1187 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1187 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
1188 1188
1189out: 1189out:
1190 if (ret > 0)
1191 add_rchar(current, ret);
1192 inc_syscr(current);
1190 fput(file); 1193 fput(file);
1191 return ret; 1194 return ret;
1192} 1195}
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1210 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1213 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
1211 1214
1212out: 1215out:
1216 if (ret > 0)
1217 add_wchar(current, ret);
1218 inc_syscw(current);
1213 fput(file); 1219 fput(file);
1214 return ret; 1220 return ret;
1215} 1221}
@@ -1703,7 +1709,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
1703} 1709}
1704 1710
1705#ifdef HAVE_SET_RESTORE_SIGMASK 1711#ifdef HAVE_SET_RESTORE_SIGMASK
1706asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, 1712static long do_compat_pselect(int n, compat_ulong_t __user *inp,
1707 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1713 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
1708 struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, 1714 struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
1709 compat_size_t sigsetsize) 1715 compat_size_t sigsetsize)
@@ -1769,8 +1775,8 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
1769 (compat_size_t __user *)(sig+sizeof(up)))) 1775 (compat_size_t __user *)(sig+sizeof(up))))
1770 return -EFAULT; 1776 return -EFAULT;
1771 } 1777 }
1772 return compat_sys_pselect7(n, inp, outp, exp, tsp, compat_ptr(up), 1778 return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
1773 sigsetsize); 1779 sigsetsize);
1774} 1780}
1775 1781
1776asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, 1782asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5235c67e7594..c8f8d5904f5e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -538,6 +538,7 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
538 * cannot be fixed without breaking all existing apps. 538 * cannot be fixed without breaking all existing apps.
539 */ 539 */
540 case TUNSETIFF: 540 case TUNSETIFF:
541 case TUNGETIFF:
541 case SIOCGIFFLAGS: 542 case SIOCGIFFLAGS:
542 case SIOCGIFMETRIC: 543 case SIOCGIFMETRIC:
543 case SIOCGIFMTU: 544 case SIOCGIFMTU:
@@ -1982,6 +1983,11 @@ COMPATIBLE_IOCTL(TUNSETNOCSUM)
1982COMPATIBLE_IOCTL(TUNSETDEBUG) 1983COMPATIBLE_IOCTL(TUNSETDEBUG)
1983COMPATIBLE_IOCTL(TUNSETPERSIST) 1984COMPATIBLE_IOCTL(TUNSETPERSIST)
1984COMPATIBLE_IOCTL(TUNSETOWNER) 1985COMPATIBLE_IOCTL(TUNSETOWNER)
1986COMPATIBLE_IOCTL(TUNSETLINK)
1987COMPATIBLE_IOCTL(TUNSETGROUP)
1988COMPATIBLE_IOCTL(TUNGETFEATURES)
1989COMPATIBLE_IOCTL(TUNSETOFFLOAD)
1990COMPATIBLE_IOCTL(TUNSETTXFILTER)
1985/* Big V */ 1991/* Big V */
1986COMPATIBLE_IOCTL(VT_SETMODE) 1992COMPATIBLE_IOCTL(VT_SETMODE)
1987COMPATIBLE_IOCTL(VT_GETMODE) 1993COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2573,6 +2579,7 @@ HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
2573HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc) 2579HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
2574HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc) 2580HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
2575HANDLE_IOCTL(TUNSETIFF, dev_ifsioc) 2581HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
2582HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
2576HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl) 2583HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
2577HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl) 2584HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
2578HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl) 2585HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
new file mode 100644
index 000000000000..13587cc97a0b
--- /dev/null
+++ b/fs/configfs/Kconfig
@@ -0,0 +1,11 @@
1config CONFIGFS_FS
2 tristate "Userspace-driven configuration filesystem"
3 depends on SYSFS
4 help
5 configfs is a ram-based filesystem that provides the converse
6 of sysfs's functionality. Where sysfs is a filesystem-based
7 view of kernel objects, configfs is a filesystem-based manager
8 of kernel objects, or config_items.
9
10 Both sysfs and configfs can and should exist together on the
11 same system. One is not a replacement for the other.
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e93341f3e82..9c2358391147 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -553,12 +553,24 @@ static void detach_groups(struct config_group *group)
553 553
554 child = sd->s_dentry; 554 child = sd->s_dentry;
555 555
556 /*
557 * Note: we hide this from lockdep since we have no way
558 * to teach lockdep about recursive
559 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
560 * in an inode tree, which are valid as soon as
561 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
562 * parent inode to one of its children.
563 */
564 lockdep_off();
556 mutex_lock(&child->d_inode->i_mutex); 565 mutex_lock(&child->d_inode->i_mutex);
566 lockdep_on();
557 567
558 configfs_detach_group(sd->s_element); 568 configfs_detach_group(sd->s_element);
559 child->d_inode->i_flags |= S_DEAD; 569 child->d_inode->i_flags |= S_DEAD;
560 570
571 lockdep_off();
561 mutex_unlock(&child->d_inode->i_mutex); 572 mutex_unlock(&child->d_inode->i_mutex);
573 lockdep_on();
562 574
563 d_delete(child); 575 d_delete(child);
564 dput(child); 576 dput(child);
@@ -748,11 +760,22 @@ static int configfs_attach_item(struct config_item *parent_item,
748 * We are going to remove an inode and its dentry but 760 * We are going to remove an inode and its dentry but
749 * the VFS may already have hit and used them. Thus, 761 * the VFS may already have hit and used them. Thus,
750 * we must lock them as rmdir() would. 762 * we must lock them as rmdir() would.
763 *
764 * Note: we hide this from lockdep since we have no way
765 * to teach lockdep about recursive
766 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
767 * in an inode tree, which are valid as soon as
768 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
769 * parent inode to one of its children.
751 */ 770 */
771 lockdep_off();
752 mutex_lock(&dentry->d_inode->i_mutex); 772 mutex_lock(&dentry->d_inode->i_mutex);
773 lockdep_on();
753 configfs_remove_dir(item); 774 configfs_remove_dir(item);
754 dentry->d_inode->i_flags |= S_DEAD; 775 dentry->d_inode->i_flags |= S_DEAD;
776 lockdep_off();
755 mutex_unlock(&dentry->d_inode->i_mutex); 777 mutex_unlock(&dentry->d_inode->i_mutex);
778 lockdep_on();
756 d_delete(dentry); 779 d_delete(dentry);
757 } 780 }
758 } 781 }
@@ -787,14 +810,25 @@ static int configfs_attach_group(struct config_item *parent_item,
787 * 810 *
788 * We must also lock the inode to remove it safely in case of 811 * We must also lock the inode to remove it safely in case of
789 * error, as rmdir() would. 812 * error, as rmdir() would.
813 *
814 * Note: we hide this from lockdep since we have no way
815 * to teach lockdep about recursive
816 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
817 * in an inode tree, which are valid as soon as
818 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
819 * parent inode to one of its children.
790 */ 820 */
821 lockdep_off();
791 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 822 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
823 lockdep_on();
792 ret = populate_groups(to_config_group(item)); 824 ret = populate_groups(to_config_group(item));
793 if (ret) { 825 if (ret) {
794 configfs_detach_item(item); 826 configfs_detach_item(item);
795 dentry->d_inode->i_flags |= S_DEAD; 827 dentry->d_inode->i_flags |= S_DEAD;
796 } 828 }
829 lockdep_off();
797 mutex_unlock(&dentry->d_inode->i_mutex); 830 mutex_unlock(&dentry->d_inode->i_mutex);
831 lockdep_on();
798 if (ret) 832 if (ret)
799 d_delete(dentry); 833 d_delete(dentry);
800 } 834 }
@@ -956,7 +990,17 @@ static int configfs_depend_prep(struct dentry *origin,
956 BUG_ON(!origin || !sd); 990 BUG_ON(!origin || !sd);
957 991
958 /* Lock this guy on the way down */ 992 /* Lock this guy on the way down */
993 /*
994 * Note: we hide this from lockdep since we have no way
995 * to teach lockdep about recursive
996 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
997 * in an inode tree, which are valid as soon as
998 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
999 * parent inode to one of its children.
1000 */
1001 lockdep_off();
959 mutex_lock(&sd->s_dentry->d_inode->i_mutex); 1002 mutex_lock(&sd->s_dentry->d_inode->i_mutex);
1003 lockdep_on();
960 if (sd->s_element == target) /* Boo-yah */ 1004 if (sd->s_element == target) /* Boo-yah */
961 goto out; 1005 goto out;
962 1006
@@ -970,7 +1014,9 @@ static int configfs_depend_prep(struct dentry *origin,
970 } 1014 }
971 1015
972 /* We looped all our children and didn't find target */ 1016 /* We looped all our children and didn't find target */
1017 lockdep_off();
973 mutex_unlock(&sd->s_dentry->d_inode->i_mutex); 1018 mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
1019 lockdep_on();
974 ret = -ENOENT; 1020 ret = -ENOENT;
975 1021
976out: 1022out:
@@ -990,11 +1036,16 @@ static void configfs_depend_rollback(struct dentry *origin,
990 struct dentry *dentry = item->ci_dentry; 1036 struct dentry *dentry = item->ci_dentry;
991 1037
992 while (dentry != origin) { 1038 while (dentry != origin) {
1039 /* See comments in configfs_depend_prep() */
1040 lockdep_off();
993 mutex_unlock(&dentry->d_inode->i_mutex); 1041 mutex_unlock(&dentry->d_inode->i_mutex);
1042 lockdep_on();
994 dentry = dentry->d_parent; 1043 dentry = dentry->d_parent;
995 } 1044 }
996 1045
1046 lockdep_off();
997 mutex_unlock(&origin->d_inode->i_mutex); 1047 mutex_unlock(&origin->d_inode->i_mutex);
1048 lockdep_on();
998} 1049}
999 1050
1000int configfs_depend_item(struct configfs_subsystem *subsys, 1051int configfs_depend_item(struct configfs_subsystem *subsys,
@@ -1329,8 +1380,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1329 } 1380 }
1330 1381
1331 /* Wait until the racing operation terminates */ 1382 /* Wait until the racing operation terminates */
1383 /*
1384 * Note: we hide this from lockdep since we are locked
1385 * with subclass I_MUTEX_NORMAL from vfs_rmdir() (why
1386 * not I_MUTEX_CHILD?), and I_MUTEX_XATTR or
1387 * I_MUTEX_QUOTA are not relevant for the locked inode.
1388 */
1389 lockdep_off();
1332 mutex_lock(wait_mutex); 1390 mutex_lock(wait_mutex);
1333 mutex_unlock(wait_mutex); 1391 mutex_unlock(wait_mutex);
1392 lockdep_on();
1334 } 1393 }
1335 } while (ret == -EAGAIN); 1394 } while (ret == -EAGAIN);
1336 1395
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc94480..5d349d38e056 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
117static inline void set_default_inode_attr(struct inode * inode, mode_t mode) 117static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
118{ 118{
119 inode->i_mode = mode; 119 inode->i_mode = mode;
120 inode->i_uid = 0;
121 inode->i_gid = 0;
122 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 120 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
123} 121}
124 122
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
136{ 134{
137 struct inode * inode = new_inode(configfs_sb); 135 struct inode * inode = new_inode(configfs_sb);
138 if (inode) { 136 if (inode) {
139 inode->i_blocks = 0;
140 inode->i_mapping->a_ops = &configfs_aops; 137 inode->i_mapping->a_ops = &configfs_aops;
141 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 138 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
142 inode->i_op = &configfs_inode_operations; 139 inode->i_op = &configfs_inode_operations;
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
new file mode 100644
index 000000000000..cd06466f365e
--- /dev/null
+++ b/fs/cramfs/Kconfig
@@ -0,0 +1,19 @@
1config CRAMFS
2 tristate "Compressed ROM file system support (cramfs)"
3 depends on BLOCK
4 select ZLIB_INFLATE
5 help
6 Saying Y here includes support for CramFs (Compressed ROM File
7 System). CramFs is designed to be a simple, small, and compressed
8 file system for ROM based embedded systems. CramFs is read-only,
9 limited to 256MB file systems (with 16MB files), and doesn't support
10 16/32 bits uid/gid, hard links and timestamps.
11
12 See <file:Documentation/filesystems/cramfs.txt> and
13 <file:fs/cramfs/README> for further information.
14
15 To compile this as a module, choose M here: the module will be called
16 cramfs. Note that the root file system (the one containing the
17 directory /) cannot be compiled as a module.
18
19 If unsure, say N.
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a14..a07338d2d140 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
83 inode->i_op = &page_symlink_inode_operations; 83 inode->i_op = &page_symlink_inode_operations;
84 inode->i_data.a_ops = &cramfs_aops; 84 inode->i_data.a_ops = &cramfs_aops;
85 } else { 85 } else {
86 inode->i_size = 0;
87 inode->i_blocks = 0;
88 init_special_inode(inode, inode->i_mode, 86 init_special_inode(inode, inode->i_mode,
89 old_decode_dev(cramfs_inode->size)); 87 old_decode_dev(cramfs_inode->size));
90 } 88 }
diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7f3e66..937df0fb0da5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -34,7 +34,6 @@
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include "internal.h" 35#include "internal.h"
36 36
37
38int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
39EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); 38EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
40 39
@@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
948 dentry->d_op = NULL; 947 dentry->d_op = NULL;
949 dentry->d_fsdata = NULL; 948 dentry->d_fsdata = NULL;
950 dentry->d_mounted = 0; 949 dentry->d_mounted = 0;
951#ifdef CONFIG_PROFILING
952 dentry->d_cookie = NULL;
953#endif
954 INIT_HLIST_NODE(&dentry->d_hash); 950 INIT_HLIST_NODE(&dentry->d_hash);
955 INIT_LIST_HEAD(&dentry->d_lru); 951 INIT_LIST_HEAD(&dentry->d_lru);
956 INIT_LIST_HEAD(&dentry->d_subdirs); 952 INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -1336,7 +1332,7 @@ err_out:
1336 * 1332 *
1337 * Searches the children of the parent dentry for the name in question. If 1333 * Searches the children of the parent dentry for the name in question. If
1338 * the dentry is found its reference count is incremented and the dentry 1334 * the dentry is found its reference count is incremented and the dentry
1339 * is returned. The caller must use d_put to free the entry when it has 1335 * is returned. The caller must use dput to free the entry when it has
1340 * finished using it. %NULL is returned on failure. 1336 * finished using it. %NULL is returned on failure.
1341 * 1337 *
1342 * __d_lookup is dcache_lock free. The hash list is protected using RCU. 1338 * __d_lookup is dcache_lock free. The hash list is protected using RCU.
@@ -1571,10 +1567,6 @@ void d_rehash(struct dentry * entry)
1571 spin_unlock(&dcache_lock); 1567 spin_unlock(&dcache_lock);
1572} 1568}
1573 1569
1574#define do_switch(x,y) do { \
1575 __typeof__ (x) __tmp = x; \
1576 x = y; y = __tmp; } while (0)
1577
1578/* 1570/*
1579 * When switching names, the actual string doesn't strictly have to 1571 * When switching names, the actual string doesn't strictly have to
1580 * be preserved in the target - because we're dropping the target 1572 * be preserved in the target - because we're dropping the target
@@ -1593,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1593 /* 1585 /*
1594 * Both external: swap the pointers 1586 * Both external: swap the pointers
1595 */ 1587 */
1596 do_switch(target->d_name.name, dentry->d_name.name); 1588 swap(target->d_name.name, dentry->d_name.name);
1597 } else { 1589 } else {
1598 /* 1590 /*
1599 * dentry:internal, target:external. Steal target's 1591 * dentry:internal, target:external. Steal target's
@@ -1620,8 +1612,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1620 */ 1612 */
1621 memcpy(dentry->d_iname, target->d_name.name, 1613 memcpy(dentry->d_iname, target->d_name.name,
1622 target->d_name.len + 1); 1614 target->d_name.len + 1);
1615 dentry->d_name.len = target->d_name.len;
1616 return;
1623 } 1617 }
1624 } 1618 }
1619 swap(dentry->d_name.len, target->d_name.len);
1625} 1620}
1626 1621
1627/* 1622/*
@@ -1681,8 +1676,7 @@ already_unhashed:
1681 1676
1682 /* Switch the names.. */ 1677 /* Switch the names.. */
1683 switch_names(dentry, target); 1678 switch_names(dentry, target);
1684 do_switch(dentry->d_name.len, target->d_name.len); 1679 swap(dentry->d_name.hash, target->d_name.hash);
1685 do_switch(dentry->d_name.hash, target->d_name.hash);
1686 1680
1687 /* ... and switch the parents */ 1681 /* ... and switch the parents */
1688 if (IS_ROOT(dentry)) { 1682 if (IS_ROOT(dentry)) {
@@ -1690,7 +1684,7 @@ already_unhashed:
1690 target->d_parent = target; 1684 target->d_parent = target;
1691 INIT_LIST_HEAD(&target->d_u.d_child); 1685 INIT_LIST_HEAD(&target->d_u.d_child);
1692 } else { 1686 } else {
1693 do_switch(dentry->d_parent, target->d_parent); 1687 swap(dentry->d_parent, target->d_parent);
1694 1688
1695 /* And add them back to the (new) parent lists */ 1689 /* And add them back to the (new) parent lists */
1696 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); 1690 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1791,8 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1791 struct dentry *dparent, *aparent; 1785 struct dentry *dparent, *aparent;
1792 1786
1793 switch_names(dentry, anon); 1787 switch_names(dentry, anon);
1794 do_switch(dentry->d_name.len, anon->d_name.len); 1788 swap(dentry->d_name.hash, anon->d_name.hash);
1795 do_switch(dentry->d_name.hash, anon->d_name.hash);
1796 1789
1797 dparent = dentry->d_parent; 1790 dparent = dentry->d_parent;
1798 aparent = anon->d_parent; 1791 aparent = anon->d_parent;
@@ -1911,7 +1904,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1911 * Convert a dentry into an ASCII path name. If the entry has been deleted 1904 * Convert a dentry into an ASCII path name. If the entry has been deleted
1912 * the string " (deleted)" is appended. Note that this is ambiguous. 1905 * the string " (deleted)" is appended. Note that this is ambiguous.
1913 * 1906 *
1914 * Returns the buffer or an error code if the path was too long. 1907 * Returns a pointer into the buffer or an error code if the
1908 * path was too long.
1915 * 1909 *
1916 * "buflen" should be positive. Caller holds the dcache_lock. 1910 * "buflen" should be positive. Caller holds the dcache_lock.
1917 * 1911 *
@@ -1987,7 +1981,10 @@ Elong:
1987 * Convert a dentry into an ASCII path name. If the entry has been deleted 1981 * Convert a dentry into an ASCII path name. If the entry has been deleted
1988 * the string " (deleted)" is appended. Note that this is ambiguous. 1982 * the string " (deleted)" is appended. Note that this is ambiguous.
1989 * 1983 *
1990 * Returns the buffer or an error code if the path was too long. 1984 * Returns a pointer into the buffer or an error code if the path was
1985 * too long. Note: Callers should use the returned pointer, not the passed
1986 * in buffer, to use the name! The implementation often starts at an offset
1987 * into the buffer, and may leave 0 bytes at the start.
1991 * 1988 *
1992 * "buflen" should be positive. 1989 * "buflen" should be positive.
1993 */ 1990 */
@@ -2095,7 +2092,7 @@ Elong:
2095 * return NULL; 2092 * return NULL;
2096 * } 2093 * }
2097 */ 2094 */
2098asmlinkage long sys_getcwd(char __user *buf, unsigned long size) 2095SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
2099{ 2096{
2100 int error; 2097 int error;
2101 struct path pwd, root; 2098 struct path pwd, root;
@@ -2313,9 +2310,6 @@ static void __init dcache_init(void)
2313/* SLAB cache for __getname() consumers */ 2310/* SLAB cache for __getname() consumers */
2314struct kmem_cache *names_cachep __read_mostly; 2311struct kmem_cache *names_cachep __read_mostly;
2315 2312
2316/* SLAB cache for file structures */
2317struct kmem_cache *filp_cachep __read_mostly;
2318
2319EXPORT_SYMBOL(d_genocide); 2313EXPORT_SYMBOL(d_genocide);
2320 2314
2321void __init vfs_caches_init_early(void) 2315void __init vfs_caches_init_early(void)
@@ -2337,9 +2331,6 @@ void __init vfs_caches_init(unsigned long mempages)
2337 names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, 2331 names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
2338 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 2332 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2339 2333
2340 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
2341 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2342
2343 dcache_init(); 2334 dcache_init();
2344 inode_init(); 2335 inode_init();
2345 files_init(mempages); 2336 files_init(mempages);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 855d4b1d619a..a21cabdbd87b 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
93{ 93{
94 struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, 94 struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
95 GFP_KERNEL); 95 GFP_KERNEL);
96 struct dentry *d;
96 if (!dcs) 97 if (!dcs)
97 return NULL; 98 return NULL;
98 99
99 path->dentry->d_cookie = dcs; 100 d = path->dentry;
101 spin_lock(&d->d_lock);
102 d->d_flags |= DCACHE_COOKIE;
103 spin_unlock(&d->d_lock);
104
100 dcs->path = *path; 105 dcs->path = *path;
101 path_get(path); 106 path_get(path);
102 hash_dcookie(dcs); 107 hash_dcookie(dcs);
@@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie)
119 goto out; 124 goto out;
120 } 125 }
121 126
122 dcs = path->dentry->d_cookie; 127 if (path->dentry->d_flags & DCACHE_COOKIE) {
123 128 dcs = find_dcookie((unsigned long)path->dentry);
124 if (!dcs) 129 } else {
125 dcs = alloc_dcookie(path); 130 dcs = alloc_dcookie(path);
126 131 if (!dcs) {
127 if (!dcs) { 132 err = -ENOMEM;
128 err = -ENOMEM; 133 goto out;
129 goto out; 134 }
130 } 135 }
131 136
132 *cookie = dcookie_value(dcs); 137 *cookie = dcookie_value(dcs);
@@ -140,7 +145,7 @@ out:
140/* And here is where the userspace process can look up the cookie value 145/* And here is where the userspace process can look up the cookie value
141 * to retrieve the path. 146 * to retrieve the path.
142 */ 147 */
143asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user * buf, size_t len) 148SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len)
144{ 149{
145 unsigned long cookie = (unsigned long)cookie64; 150 unsigned long cookie = (unsigned long)cookie64;
146 int err = -EINVAL; 151 int err = -EINVAL;
@@ -193,7 +198,13 @@ out:
193 mutex_unlock(&dcookie_mutex); 198 mutex_unlock(&dcookie_mutex);
194 return err; 199 return err;
195} 200}
196 201#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
202asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len)
203{
204 return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len);
205}
206SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie);
207#endif
197 208
198static int dcookie_init(void) 209static int dcookie_init(void)
199{ 210{
@@ -251,7 +262,12 @@ out_kmem:
251 262
252static void free_dcookie(struct dcookie_struct * dcs) 263static void free_dcookie(struct dcookie_struct * dcs)
253{ 264{
254 dcs->path.dentry->d_cookie = NULL; 265 struct dentry *d = dcs->path.dentry;
266
267 spin_lock(&d->d_lock);
268 d->d_flags &= ~DCACHE_COOKIE;
269 spin_unlock(&d->d_lock);
270
255 path_put(&dcs->path); 271 path_put(&dcs->path);
256 kmem_cache_free(dcookie_cache, dcs); 272 kmem_cache_free(dcookie_cache, dcs);
257} 273}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 159a5efd6a8a..33a90120f6ad 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
294} 294}
295EXPORT_SYMBOL_GPL(debugfs_create_x32); 295EXPORT_SYMBOL_GPL(debugfs_create_x32);
296 296
297
298static int debugfs_size_t_set(void *data, u64 val)
299{
300 *(size_t *)data = val;
301 return 0;
302}
303static int debugfs_size_t_get(void *data, u64 *val)
304{
305 *val = *(size_t *)data;
306 return 0;
307}
308DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
309 "%llu\n"); /* %llu and %zu are more or less the same */
310
311/**
312 * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
313 * @name: a pointer to a string containing the name of the file to create.
314 * @mode: the permission that the file should have
315 * @parent: a pointer to the parent dentry for this file. This should be a
316 * directory dentry if set. If this parameter is %NULL, then the
317 * file will be created in the root of the debugfs filesystem.
318 * @value: a pointer to the variable that the file should read to and write
319 * from.
320 */
321struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
322 struct dentry *parent, size_t *value)
323{
324 return debugfs_create_file(name, mode, parent, value, &fops_size_t);
325}
326EXPORT_SYMBOL_GPL(debugfs_create_size_t);
327
328
297static ssize_t read_file_bool(struct file *file, char __user *user_buf, 329static ssize_t read_file_bool(struct file *file, char __user *user_buf,
298 size_t count, loff_t *ppos) 330 size_t count, loff_t *ppos)
299{ 331{
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf36..81ae9ea3c6e1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
37 37
38 if (inode) { 38 if (inode) {
39 inode->i_mode = mode; 39 inode->i_mode = mode;
40 inode->i_uid = 0;
41 inode->i_gid = 0;
42 inode->i_blocks = 0;
43 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 40 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
44 switch (mode & S_IFMT) { 41 switch (mode & S_IFMT) {
45 default: 42 default:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5d61b7c06e13..5f3231b9633f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,25 +27,32 @@
27#define DEVPTS_SUPER_MAGIC 0x1cd1 27#define DEVPTS_SUPER_MAGIC 0x1cd1
28 28
29#define DEVPTS_DEFAULT_MODE 0600 29#define DEVPTS_DEFAULT_MODE 0600
30/*
31 * ptmx is a new node in /dev/pts and will be unused in legacy (single-
32 * instance) mode. To prevent surprises in user space, set permissions of
33 * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
34 * permissions.
35 */
36#define DEVPTS_DEFAULT_PTMX_MODE 0000
30#define PTMX_MINOR 2 37#define PTMX_MINOR 2
31 38
32extern int pty_limit; /* Config limit on Unix98 ptys */ 39extern int pty_limit; /* Config limit on Unix98 ptys */
33static DEFINE_IDA(allocated_ptys);
34static DEFINE_MUTEX(allocated_ptys_lock); 40static DEFINE_MUTEX(allocated_ptys_lock);
35 41
36static struct vfsmount *devpts_mnt; 42static struct vfsmount *devpts_mnt;
37static struct dentry *devpts_root;
38 43
39static struct { 44struct pts_mount_opts {
40 int setuid; 45 int setuid;
41 int setgid; 46 int setgid;
42 uid_t uid; 47 uid_t uid;
43 gid_t gid; 48 gid_t gid;
44 umode_t mode; 49 umode_t mode;
45} config = {.mode = DEVPTS_DEFAULT_MODE}; 50 umode_t ptmxmode;
51 int newinstance;
52};
46 53
47enum { 54enum {
48 Opt_uid, Opt_gid, Opt_mode, 55 Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,
49 Opt_err 56 Opt_err
50}; 57};
51 58
@@ -53,18 +60,50 @@ static const match_table_t tokens = {
53 {Opt_uid, "uid=%u"}, 60 {Opt_uid, "uid=%u"},
54 {Opt_gid, "gid=%u"}, 61 {Opt_gid, "gid=%u"},
55 {Opt_mode, "mode=%o"}, 62 {Opt_mode, "mode=%o"},
63#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
64 {Opt_ptmxmode, "ptmxmode=%o"},
65 {Opt_newinstance, "newinstance"},
66#endif
56 {Opt_err, NULL} 67 {Opt_err, NULL}
57}; 68};
58 69
59static int devpts_remount(struct super_block *sb, int *flags, char *data) 70struct pts_fs_info {
71 struct ida allocated_ptys;
72 struct pts_mount_opts mount_opts;
73 struct dentry *ptmx_dentry;
74};
75
76static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
77{
78 return sb->s_fs_info;
79}
80
81static inline struct super_block *pts_sb_from_inode(struct inode *inode)
82{
83#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
84 if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
85 return inode->i_sb;
86#endif
87 return devpts_mnt->mnt_sb;
88}
89
90#define PARSE_MOUNT 0
91#define PARSE_REMOUNT 1
92
93static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
60{ 94{
61 char *p; 95 char *p;
62 96
63 config.setuid = 0; 97 opts->setuid = 0;
64 config.setgid = 0; 98 opts->setgid = 0;
65 config.uid = 0; 99 opts->uid = 0;
66 config.gid = 0; 100 opts->gid = 0;
67 config.mode = DEVPTS_DEFAULT_MODE; 101 opts->mode = DEVPTS_DEFAULT_MODE;
102 opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
103
104 /* newinstance makes sense only on initial mount */
105 if (op == PARSE_MOUNT)
106 opts->newinstance = 0;
68 107
69 while ((p = strsep(&data, ",")) != NULL) { 108 while ((p = strsep(&data, ",")) != NULL) {
70 substring_t args[MAX_OPT_ARGS]; 109 substring_t args[MAX_OPT_ARGS];
@@ -79,20 +118,32 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
79 case Opt_uid: 118 case Opt_uid:
80 if (match_int(&args[0], &option)) 119 if (match_int(&args[0], &option))
81 return -EINVAL; 120 return -EINVAL;
82 config.uid = option; 121 opts->uid = option;
83 config.setuid = 1; 122 opts->setuid = 1;
84 break; 123 break;
85 case Opt_gid: 124 case Opt_gid:
86 if (match_int(&args[0], &option)) 125 if (match_int(&args[0], &option))
87 return -EINVAL; 126 return -EINVAL;
88 config.gid = option; 127 opts->gid = option;
89 config.setgid = 1; 128 opts->setgid = 1;
90 break; 129 break;
91 case Opt_mode: 130 case Opt_mode:
92 if (match_octal(&args[0], &option)) 131 if (match_octal(&args[0], &option))
93 return -EINVAL; 132 return -EINVAL;
94 config.mode = option & S_IALLUGO; 133 opts->mode = option & S_IALLUGO;
134 break;
135#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
136 case Opt_ptmxmode:
137 if (match_octal(&args[0], &option))
138 return -EINVAL;
139 opts->ptmxmode = option & S_IALLUGO;
140 break;
141 case Opt_newinstance:
142 /* newinstance makes sense only on initial mount */
143 if (op == PARSE_MOUNT)
144 opts->newinstance = 1;
95 break; 145 break;
146#endif
96 default: 147 default:
97 printk(KERN_ERR "devpts: called with bogus options\n"); 148 printk(KERN_ERR "devpts: called with bogus options\n");
98 return -EINVAL; 149 return -EINVAL;
@@ -102,13 +153,106 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
102 return 0; 153 return 0;
103} 154}
104 155
156#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
157static int mknod_ptmx(struct super_block *sb)
158{
159 int mode;
160 int rc = -ENOMEM;
161 struct dentry *dentry;
162 struct inode *inode;
163 struct dentry *root = sb->s_root;
164 struct pts_fs_info *fsi = DEVPTS_SB(sb);
165 struct pts_mount_opts *opts = &fsi->mount_opts;
166
167 mutex_lock(&root->d_inode->i_mutex);
168
169 /* If we have already created ptmx node, return */
170 if (fsi->ptmx_dentry) {
171 rc = 0;
172 goto out;
173 }
174
175 dentry = d_alloc_name(root, "ptmx");
176 if (!dentry) {
177 printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n");
178 goto out;
179 }
180
181 /*
182 * Create a new 'ptmx' node in this mount of devpts.
183 */
184 inode = new_inode(sb);
185 if (!inode) {
186 printk(KERN_ERR "Unable to alloc inode for ptmx node\n");
187 dput(dentry);
188 goto out;
189 }
190
191 inode->i_ino = 2;
192 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
193
194 mode = S_IFCHR|opts->ptmxmode;
195 init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
196
197 d_add(dentry, inode);
198
199 fsi->ptmx_dentry = dentry;
200 rc = 0;
201
202 printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
203 inode->i_ino);
204out:
205 mutex_unlock(&root->d_inode->i_mutex);
206 return rc;
207}
208
209static void update_ptmx_mode(struct pts_fs_info *fsi)
210{
211 struct inode *inode;
212 if (fsi->ptmx_dentry) {
213 inode = fsi->ptmx_dentry->d_inode;
214 inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
215 }
216}
217#else
218static inline void update_ptmx_mode(struct pts_fs_info *fsi)
219{
220 return;
221}
222#endif
223
224static int devpts_remount(struct super_block *sb, int *flags, char *data)
225{
226 int err;
227 struct pts_fs_info *fsi = DEVPTS_SB(sb);
228 struct pts_mount_opts *opts = &fsi->mount_opts;
229
230 err = parse_mount_options(data, PARSE_REMOUNT, opts);
231
232 /*
233 * parse_mount_options() restores options to default values
234 * before parsing and may have changed ptmxmode. So, update the
235 * mode in the inode too. Bogus options don't fail the remount,
236 * so do this even on error return.
237 */
238 update_ptmx_mode(fsi);
239
240 return err;
241}
242
105static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) 243static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
106{ 244{
107 if (config.setuid) 245 struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
108 seq_printf(seq, ",uid=%u", config.uid); 246 struct pts_mount_opts *opts = &fsi->mount_opts;
109 if (config.setgid) 247
110 seq_printf(seq, ",gid=%u", config.gid); 248 if (opts->setuid)
111 seq_printf(seq, ",mode=%03o", config.mode); 249 seq_printf(seq, ",uid=%u", opts->uid);
250 if (opts->setgid)
251 seq_printf(seq, ",gid=%u", opts->gid);
252 seq_printf(seq, ",mode=%03o", opts->mode);
253#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
254 seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
255#endif
112 256
113 return 0; 257 return 0;
114} 258}
@@ -119,10 +263,25 @@ static const struct super_operations devpts_sops = {
119 .show_options = devpts_show_options, 263 .show_options = devpts_show_options,
120}; 264};
121 265
266static void *new_pts_fs_info(void)
267{
268 struct pts_fs_info *fsi;
269
270 fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
271 if (!fsi)
272 return NULL;
273
274 ida_init(&fsi->allocated_ptys);
275 fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
276 fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
277
278 return fsi;
279}
280
122static int 281static int
123devpts_fill_super(struct super_block *s, void *data, int silent) 282devpts_fill_super(struct super_block *s, void *data, int silent)
124{ 283{
125 struct inode * inode; 284 struct inode *inode;
126 285
127 s->s_blocksize = 1024; 286 s->s_blocksize = 1024;
128 s->s_blocksize_bits = 10; 287 s->s_blocksize_bits = 10;
@@ -130,39 +289,240 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
130 s->s_op = &devpts_sops; 289 s->s_op = &devpts_sops;
131 s->s_time_gran = 1; 290 s->s_time_gran = 1;
132 291
292 s->s_fs_info = new_pts_fs_info();
293 if (!s->s_fs_info)
294 goto fail;
295
133 inode = new_inode(s); 296 inode = new_inode(s);
134 if (!inode) 297 if (!inode)
135 goto fail; 298 goto free_fsi;
136 inode->i_ino = 1; 299 inode->i_ino = 1;
137 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 300 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
138 inode->i_blocks = 0;
139 inode->i_uid = inode->i_gid = 0;
140 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 301 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
141 inode->i_op = &simple_dir_inode_operations; 302 inode->i_op = &simple_dir_inode_operations;
142 inode->i_fop = &simple_dir_operations; 303 inode->i_fop = &simple_dir_operations;
143 inode->i_nlink = 2; 304 inode->i_nlink = 2;
144 305
145 devpts_root = s->s_root = d_alloc_root(inode); 306 s->s_root = d_alloc_root(inode);
146 if (s->s_root) 307 if (s->s_root)
147 return 0; 308 return 0;
148 309
149 printk("devpts: get root dentry failed\n"); 310 printk(KERN_ERR "devpts: get root dentry failed\n");
150 iput(inode); 311 iput(inode);
312
313free_fsi:
314 kfree(s->s_fs_info);
151fail: 315fail:
152 return -ENOMEM; 316 return -ENOMEM;
153} 317}
154 318
319#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
320static int compare_init_pts_sb(struct super_block *s, void *p)
321{
322 if (devpts_mnt)
323 return devpts_mnt->mnt_sb == s;
324 return 0;
325}
326
327/*
328 * Safely parse the mount options in @data and update @opts.
329 *
330 * devpts ends up parsing options two times during mount, due to the
331 * two modes of operation it supports. The first parse occurs in
332 * devpts_get_sb() when determining the mode (single-instance or
333 * multi-instance mode). The second parse happens in devpts_remount()
334 * or new_pts_mount() depending on the mode.
335 *
336 * Parsing of options modifies the @data making subsequent parsing
337 * incorrect. So make a local copy of @data and parse it.
338 *
339 * Return: 0 On success, -errno on error
340 */
341static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts)
342{
343 int rc;
344 void *datacp;
345
346 if (!data)
347 return 0;
348
349 /* Use kstrdup() ? */
350 datacp = kmalloc(PAGE_SIZE, GFP_KERNEL);
351 if (!datacp)
352 return -ENOMEM;
353
354 memcpy(datacp, data, PAGE_SIZE);
355 rc = parse_mount_options((char *)datacp, PARSE_MOUNT, opts);
356 kfree(datacp);
357
358 return rc;
359}
360
361/*
362 * Mount a new (private) instance of devpts. PTYs created in this
363 * instance are independent of the PTYs in other devpts instances.
364 */
365static int new_pts_mount(struct file_system_type *fs_type, int flags,
366 void *data, struct vfsmount *mnt)
367{
368 int err;
369 struct pts_fs_info *fsi;
370 struct pts_mount_opts *opts;
371
372 printk(KERN_NOTICE "devpts: newinstance mount\n");
373
374 err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
375 if (err)
376 return err;
377
378 fsi = DEVPTS_SB(mnt->mnt_sb);
379 opts = &fsi->mount_opts;
380
381 err = parse_mount_options(data, PARSE_MOUNT, opts);
382 if (err)
383 goto fail;
384
385 err = mknod_ptmx(mnt->mnt_sb);
386 if (err)
387 goto fail;
388
389 return 0;
390
391fail:
392 dput(mnt->mnt_sb->s_root);
393 deactivate_super(mnt->mnt_sb);
394 return err;
395}
396
397/*
398 * Check if 'newinstance' mount option was specified in @data.
399 *
400 * Return: -errno on error (eg: invalid mount options specified)
401 * : 1 if 'newinstance' mount option was specified
402 * : 0 if 'newinstance' mount option was NOT specified
403 */
404static int is_new_instance_mount(void *data)
405{
406 int rc;
407 struct pts_mount_opts opts;
408
409 if (!data)
410 return 0;
411
412 rc = safe_parse_mount_options(data, &opts);
413 if (!rc)
414 rc = opts.newinstance;
415
416 return rc;
417}
418
419/*
420 * get_init_pts_sb()
421 *
422 * This interface is needed to support multiple namespace semantics in
423 * devpts while preserving backward compatibility of the current 'single-
424 * namespace' semantics. i.e all mounts of devpts without the 'newinstance'
425 * mount option should bind to the initial kernel mount, like
426 * get_sb_single().
427 *
428 * Mounts with 'newinstance' option create a new private namespace.
429 *
430 * But for single-mount semantics, devpts cannot use get_sb_single(),
431 * because get_sb_single()/sget() find and use the super-block from
432 * the most recent mount of devpts. But that recent mount may be a
433 * 'newinstance' mount and get_sb_single() would pick the newinstance
434 * super-block instead of the initial super-block.
435 *
436 * This interface is identical to get_sb_single() except that it
437 * consistently selects the 'single-namespace' superblock even in the
438 * presence of the private namespace (i.e 'newinstance') super-blocks.
439 */
440static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
441 void *data, struct vfsmount *mnt)
442{
443 struct super_block *s;
444 int error;
445
446 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
447 if (IS_ERR(s))
448 return PTR_ERR(s);
449
450 if (!s->s_root) {
451 s->s_flags = flags;
452 error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
453 if (error) {
454 up_write(&s->s_umount);
455 deactivate_super(s);
456 return error;
457 }
458 s->s_flags |= MS_ACTIVE;
459 }
460 do_remount_sb(s, flags, data, 0);
461 return simple_set_mnt(mnt, s);
462}
463
464/*
465 * Mount or remount the initial kernel mount of devpts. This type of
466 * mount maintains the legacy, single-instance semantics, while the
467 * kernel still allows multiple-instances.
468 */
469static int init_pts_mount(struct file_system_type *fs_type, int flags,
470 void *data, struct vfsmount *mnt)
471{
472 int err;
473
474 err = get_init_pts_sb(fs_type, flags, data, mnt);
475 if (err)
476 return err;
477
478 err = mknod_ptmx(mnt->mnt_sb);
479 if (err) {
480 dput(mnt->mnt_sb->s_root);
481 deactivate_super(mnt->mnt_sb);
482 }
483
484 return err;
485}
486
155static int devpts_get_sb(struct file_system_type *fs_type, 487static int devpts_get_sb(struct file_system_type *fs_type,
156 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 488 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
157{ 489{
490 int new;
491
492 new = is_new_instance_mount(data);
493 if (new < 0)
494 return new;
495
496 if (new)
497 return new_pts_mount(fs_type, flags, data, mnt);
498
499 return init_pts_mount(fs_type, flags, data, mnt);
500}
501#else
502/*
503 * This supports only the legacy single-instance semantics (no
504 * multiple-instance semantics)
505 */
506static int devpts_get_sb(struct file_system_type *fs_type, int flags,
507 const char *dev_name, void *data, struct vfsmount *mnt)
508{
158 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); 509 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
159} 510}
511#endif
512
513static void devpts_kill_sb(struct super_block *sb)
514{
515 struct pts_fs_info *fsi = DEVPTS_SB(sb);
516
517 kfree(fsi);
518 kill_litter_super(sb);
519}
160 520
161static struct file_system_type devpts_fs_type = { 521static struct file_system_type devpts_fs_type = {
162 .owner = THIS_MODULE, 522 .owner = THIS_MODULE,
163 .name = "devpts", 523 .name = "devpts",
164 .get_sb = devpts_get_sb, 524 .get_sb = devpts_get_sb,
165 .kill_sb = kill_anon_super, 525 .kill_sb = devpts_kill_sb,
166}; 526};
167 527
168/* 528/*
@@ -172,16 +532,17 @@ static struct file_system_type devpts_fs_type = {
172 532
173int devpts_new_index(struct inode *ptmx_inode) 533int devpts_new_index(struct inode *ptmx_inode)
174{ 534{
535 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
536 struct pts_fs_info *fsi = DEVPTS_SB(sb);
175 int index; 537 int index;
176 int ida_ret; 538 int ida_ret;
177 539
178retry: 540retry:
179 if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) { 541 if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
180 return -ENOMEM; 542 return -ENOMEM;
181 }
182 543
183 mutex_lock(&allocated_ptys_lock); 544 mutex_lock(&allocated_ptys_lock);
184 ida_ret = ida_get_new(&allocated_ptys, &index); 545 ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
185 if (ida_ret < 0) { 546 if (ida_ret < 0) {
186 mutex_unlock(&allocated_ptys_lock); 547 mutex_unlock(&allocated_ptys_lock);
187 if (ida_ret == -EAGAIN) 548 if (ida_ret == -EAGAIN)
@@ -190,7 +551,7 @@ retry:
190 } 551 }
191 552
192 if (index >= pty_limit) { 553 if (index >= pty_limit) {
193 ida_remove(&allocated_ptys, index); 554 ida_remove(&fsi->allocated_ptys, index);
194 mutex_unlock(&allocated_ptys_lock); 555 mutex_unlock(&allocated_ptys_lock);
195 return -EIO; 556 return -EIO;
196 } 557 }
@@ -200,18 +561,26 @@ retry:
200 561
201void devpts_kill_index(struct inode *ptmx_inode, int idx) 562void devpts_kill_index(struct inode *ptmx_inode, int idx)
202{ 563{
564 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
565 struct pts_fs_info *fsi = DEVPTS_SB(sb);
566
203 mutex_lock(&allocated_ptys_lock); 567 mutex_lock(&allocated_ptys_lock);
204 ida_remove(&allocated_ptys, idx); 568 ida_remove(&fsi->allocated_ptys, idx);
205 mutex_unlock(&allocated_ptys_lock); 569 mutex_unlock(&allocated_ptys_lock);
206} 570}
207 571
208int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) 572int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
209{ 573{
210 int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ 574 /* tty layer puts index from devpts_new_index() in here */
575 int number = tty->index;
211 struct tty_driver *driver = tty->driver; 576 struct tty_driver *driver = tty->driver;
212 dev_t device = MKDEV(driver->major, driver->minor_start+number); 577 dev_t device = MKDEV(driver->major, driver->minor_start+number);
213 struct dentry *dentry; 578 struct dentry *dentry;
214 struct inode *inode = new_inode(devpts_mnt->mnt_sb); 579 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
580 struct inode *inode = new_inode(sb);
581 struct dentry *root = sb->s_root;
582 struct pts_fs_info *fsi = DEVPTS_SB(sb);
583 struct pts_mount_opts *opts = &fsi->mount_opts;
215 char s[12]; 584 char s[12];
216 585
217 /* We're supposed to be given the slave end of a pty */ 586 /* We're supposed to be given the slave end of a pty */
@@ -221,25 +590,25 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
221 if (!inode) 590 if (!inode)
222 return -ENOMEM; 591 return -ENOMEM;
223 592
224 inode->i_ino = number+2; 593 inode->i_ino = number + 3;
225 inode->i_uid = config.setuid ? config.uid : current_fsuid(); 594 inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
226 inode->i_gid = config.setgid ? config.gid : current_fsgid(); 595 inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
227 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 596 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
228 init_special_inode(inode, S_IFCHR|config.mode, device); 597 init_special_inode(inode, S_IFCHR|opts->mode, device);
229 inode->i_private = tty; 598 inode->i_private = tty;
230 tty->driver_data = inode; 599 tty->driver_data = inode;
231 600
232 sprintf(s, "%d", number); 601 sprintf(s, "%d", number);
233 602
234 mutex_lock(&devpts_root->d_inode->i_mutex); 603 mutex_lock(&root->d_inode->i_mutex);
235 604
236 dentry = d_alloc_name(devpts_root, s); 605 dentry = d_alloc_name(root, s);
237 if (!IS_ERR(dentry)) { 606 if (!IS_ERR(dentry)) {
238 d_add(dentry, inode); 607 d_add(dentry, inode);
239 fsnotify_create(devpts_root->d_inode, dentry); 608 fsnotify_create(root->d_inode, dentry);
240 } 609 }
241 610
242 mutex_unlock(&devpts_root->d_inode->i_mutex); 611 mutex_unlock(&root->d_inode->i_mutex);
243 612
244 return 0; 613 return 0;
245} 614}
@@ -256,20 +625,27 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
256void devpts_pty_kill(struct tty_struct *tty) 625void devpts_pty_kill(struct tty_struct *tty)
257{ 626{
258 struct inode *inode = tty->driver_data; 627 struct inode *inode = tty->driver_data;
628 struct super_block *sb = pts_sb_from_inode(inode);
629 struct dentry *root = sb->s_root;
259 struct dentry *dentry; 630 struct dentry *dentry;
260 631
261 BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); 632 BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
262 633
263 mutex_lock(&devpts_root->d_inode->i_mutex); 634 mutex_lock(&root->d_inode->i_mutex);
264 635
265 dentry = d_find_alias(inode); 636 dentry = d_find_alias(inode);
266 if (dentry && !IS_ERR(dentry)) { 637 if (IS_ERR(dentry))
638 goto out;
639
640 if (dentry) {
267 inode->i_nlink--; 641 inode->i_nlink--;
268 d_delete(dentry); 642 d_delete(dentry);
269 dput(dentry); 643 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
270 } 644 }
271 645
272 mutex_unlock(&devpts_root->d_inode->i_mutex); 646 dput(dentry); /* d_find_alias above */
647out:
648 mutex_unlock(&root->d_inode->i_mutex);
273} 649}
274 650
275static int __init init_devpts_fs(void) 651static int __init init_devpts_fs(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index af0558dbe8b7..b6d43908ff7a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1209 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1210 nr_segs, blkbits, get_block, end_io, dio); 1210 nr_segs, blkbits, get_block, end_io, dio);
1211 1211
1212 /*
1213 * In case of error extending write may have instantiated a few
1214 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1215 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
1216 * it's own meaner.
1217 */
1218 if (unlikely(retval < 0 && (rw & WRITE))) {
1219 loff_t isize = i_size_read(inode);
1220
1221 if (end > isize && dio_lock_type == DIO_LOCKING)
1222 vmtruncate(inode, isize);
1223 }
1224
1212 if (rw == READ && dio_lock_type == DIO_LOCKING) 1225 if (rw == READ && dio_lock_type == DIO_LOCKING)
1213 release_i_mutex = 0; 1226 release_i_mutex = 0;
1214 1227
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 8bf31e3fbf01..dc2ad6008b2d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
33 spin_unlock(&ast_queue_lock); 33 spin_unlock(&ast_queue_lock);
34} 34}
35 35
36void dlm_add_ast(struct dlm_lkb *lkb, int type) 36void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
37{ 37{
38 if (lkb->lkb_flags & DLM_IFL_USER) { 38 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type); 39 dlm_user_add_ast(lkb, type, bastmode);
40 return; 40 return;
41 } 41 }
42 42
@@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type)
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 } 47 }
48 lkb->lkb_ast_type |= type; 48 lkb->lkb_ast_type |= type;
49 if (bastmode)
50 lkb->lkb_bastmode = bastmode;
49 spin_unlock(&ast_queue_lock); 51 spin_unlock(&ast_queue_lock);
50 52
51 set_bit(WAKE_ASTS, &astd_wakeflags); 53 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,50 +61,40 @@ static void process_asts(void)
59 struct dlm_lkb *lkb; 61 struct dlm_lkb *lkb;
60 void (*cast) (void *astparam); 62 void (*cast) (void *astparam);
61 void (*bast) (void *astparam, int mode); 63 void (*bast) (void *astparam, int mode);
62 int type = 0, found, bmode; 64 int type = 0, bastmode;
63 65
64 for (;;) { 66repeat:
65 found = 0; 67 spin_lock(&ast_queue_lock);
66 spin_lock(&ast_queue_lock); 68 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
67 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { 69 r = lkb->lkb_resource;
68 r = lkb->lkb_resource; 70 ls = r->res_ls;
69 ls = r->res_ls; 71
70 72 if (dlm_locking_stopped(ls))
71 if (dlm_locking_stopped(ls)) 73 continue;
72 continue;
73
74 list_del(&lkb->lkb_astqueue);
75 type = lkb->lkb_ast_type;
76 lkb->lkb_ast_type = 0;
77 found = 1;
78 break;
79 }
80 spin_unlock(&ast_queue_lock);
81 74
82 if (!found) 75 list_del(&lkb->lkb_astqueue);
83 break; 76 type = lkb->lkb_ast_type;
77 lkb->lkb_ast_type = 0;
78 bastmode = lkb->lkb_bastmode;
84 79
80 spin_unlock(&ast_queue_lock);
85 cast = lkb->lkb_astfn; 81 cast = lkb->lkb_astfn;
86 bast = lkb->lkb_bastfn; 82 bast = lkb->lkb_bastfn;
87 bmode = lkb->lkb_bastmode;
88 83
89 if ((type & AST_COMP) && cast) 84 if ((type & AST_COMP) && cast)
90 cast(lkb->lkb_astparam); 85 cast(lkb->lkb_astparam);
91 86
92 /* FIXME: Is it safe to look at lkb_grmode here
93 without doing a lock_rsb() ?
94 Look at other checks in v1 to avoid basts. */
95
96 if ((type & AST_BAST) && bast) 87 if ((type & AST_BAST) && bast)
97 if (!dlm_modes_compat(lkb->lkb_grmode, bmode)) 88 bast(lkb->lkb_astparam, bastmode);
98 bast(lkb->lkb_astparam, bmode);
99 89
100 /* this removes the reference added by dlm_add_ast 90 /* this removes the reference added by dlm_add_ast
101 and may result in the lkb being freed */ 91 and may result in the lkb being freed */
102 dlm_put_lkb(lkb); 92 dlm_put_lkb(lkb);
103 93
104 schedule(); 94 cond_resched();
95 goto repeat;
105 } 96 }
97 spin_unlock(&ast_queue_lock);
106} 98}
107 99
108static inline int no_asts(void) 100static inline int no_asts(void)
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 6ee276c74c52..1b5fc5f428fd 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type); 16void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
17void dlm_del_ast(struct dlm_lkb *lkb); 17void dlm_del_ast(struct dlm_lkb *lkb);
18 18
19void dlm_astd_wake(void); 19void dlm_astd_wake(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fc24f4507a3..1d1d27442235 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -25,19 +25,6 @@ static struct mutex debug_buf_lock;
25 25
26static struct dentry *dlm_root; 26static struct dentry *dlm_root;
27 27
28struct rsb_iter {
29 int entry;
30 int locks;
31 int header;
32 struct dlm_ls *ls;
33 struct list_head *next;
34 struct dlm_rsb *rsb;
35};
36
37/*
38 * dump all rsb's in the lockspace hash table
39 */
40
41static char *print_lockmode(int mode) 28static char *print_lockmode(int mode)
42{ 29{
43 switch (mode) { 30 switch (mode) {
@@ -60,13 +47,13 @@ static char *print_lockmode(int mode)
60 } 47 }
61} 48}
62 49
63static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, 50static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
64 struct dlm_rsb *res) 51 struct dlm_rsb *res)
65{ 52{
66 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); 53 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
67 54
68 if (lkb->lkb_status == DLM_LKSTS_CONVERT 55 if (lkb->lkb_status == DLM_LKSTS_CONVERT ||
69 || lkb->lkb_status == DLM_LKSTS_WAITING) 56 lkb->lkb_status == DLM_LKSTS_WAITING)
70 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); 57 seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
71 58
72 if (lkb->lkb_nodeid) { 59 if (lkb->lkb_nodeid) {
@@ -80,33 +67,42 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
80 if (lkb->lkb_wait_type) 67 if (lkb->lkb_wait_type)
81 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); 68 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
82 69
83 seq_printf(s, "\n"); 70 return seq_printf(s, "\n");
84} 71}
85 72
86static int print_resource(struct dlm_rsb *res, struct seq_file *s) 73static int print_format1(struct dlm_rsb *res, struct seq_file *s)
87{ 74{
88 struct dlm_lkb *lkb; 75 struct dlm_lkb *lkb;
89 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; 76 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
77 int rv;
90 78
91 lock_rsb(res); 79 lock_rsb(res);
92 80
93 seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length); 81 rv = seq_printf(s, "\nResource %p Name (len=%d) \"",
82 res, res->res_length);
83 if (rv)
84 goto out;
85
94 for (i = 0; i < res->res_length; i++) { 86 for (i = 0; i < res->res_length; i++) {
95 if (isprint(res->res_name[i])) 87 if (isprint(res->res_name[i]))
96 seq_printf(s, "%c", res->res_name[i]); 88 seq_printf(s, "%c", res->res_name[i]);
97 else 89 else
98 seq_printf(s, "%c", '.'); 90 seq_printf(s, "%c", '.');
99 } 91 }
92
100 if (res->res_nodeid > 0) 93 if (res->res_nodeid > 0)
101 seq_printf(s, "\" \nLocal Copy, Master is node %d\n", 94 rv = seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
102 res->res_nodeid); 95 res->res_nodeid);
103 else if (res->res_nodeid == 0) 96 else if (res->res_nodeid == 0)
104 seq_printf(s, "\" \nMaster Copy\n"); 97 rv = seq_printf(s, "\" \nMaster Copy\n");
105 else if (res->res_nodeid == -1) 98 else if (res->res_nodeid == -1)
106 seq_printf(s, "\" \nLooking up master (lkid %x)\n", 99 rv = seq_printf(s, "\" \nLooking up master (lkid %x)\n",
107 res->res_first_lkid); 100 res->res_first_lkid);
108 else 101 else
109 seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid); 102 rv = seq_printf(s, "\" \nInvalid master %d\n",
103 res->res_nodeid);
104 if (rv)
105 goto out;
110 106
111 /* Print the LVB: */ 107 /* Print the LVB: */
112 if (res->res_lvbptr) { 108 if (res->res_lvbptr) {
@@ -119,329 +115,489 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
119 } 115 }
120 if (rsb_flag(res, RSB_VALNOTVALID)) 116 if (rsb_flag(res, RSB_VALNOTVALID))
121 seq_printf(s, " (INVALID)"); 117 seq_printf(s, " (INVALID)");
122 seq_printf(s, "\n"); 118 rv = seq_printf(s, "\n");
119 if (rv)
120 goto out;
123 } 121 }
124 122
125 root_list = !list_empty(&res->res_root_list); 123 root_list = !list_empty(&res->res_root_list);
126 recover_list = !list_empty(&res->res_recover_list); 124 recover_list = !list_empty(&res->res_recover_list);
127 125
128 if (root_list || recover_list) { 126 if (root_list || recover_list) {
129 seq_printf(s, "Recovery: root %d recover %d flags %lx " 127 rv = seq_printf(s, "Recovery: root %d recover %d flags %lx "
130 "count %d\n", root_list, recover_list, 128 "count %d\n", root_list, recover_list,
131 res->res_flags, res->res_recover_locks_count); 129 res->res_flags, res->res_recover_locks_count);
130 if (rv)
131 goto out;
132 } 132 }
133 133
134 /* Print the locks attached to this resource */ 134 /* Print the locks attached to this resource */
135 seq_printf(s, "Granted Queue\n"); 135 seq_printf(s, "Granted Queue\n");
136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) 136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) {
137 print_resource_lock(s, lkb, res); 137 rv = print_format1_lock(s, lkb, res);
138 if (rv)
139 goto out;
140 }
138 141
139 seq_printf(s, "Conversion Queue\n"); 142 seq_printf(s, "Conversion Queue\n");
140 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) 143 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) {
141 print_resource_lock(s, lkb, res); 144 rv = print_format1_lock(s, lkb, res);
145 if (rv)
146 goto out;
147 }
142 148
143 seq_printf(s, "Waiting Queue\n"); 149 seq_printf(s, "Waiting Queue\n");
144 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) 150 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) {
145 print_resource_lock(s, lkb, res); 151 rv = print_format1_lock(s, lkb, res);
152 if (rv)
153 goto out;
154 }
146 155
147 if (list_empty(&res->res_lookup)) 156 if (list_empty(&res->res_lookup))
148 goto out; 157 goto out;
149 158
150 seq_printf(s, "Lookup Queue\n"); 159 seq_printf(s, "Lookup Queue\n");
151 list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { 160 list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
152 seq_printf(s, "%08x %s", lkb->lkb_id, 161 rv = seq_printf(s, "%08x %s", lkb->lkb_id,
153 print_lockmode(lkb->lkb_rqmode)); 162 print_lockmode(lkb->lkb_rqmode));
154 if (lkb->lkb_wait_type) 163 if (lkb->lkb_wait_type)
155 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); 164 seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
156 seq_printf(s, "\n"); 165 rv = seq_printf(s, "\n");
157 } 166 }
158 out: 167 out:
159 unlock_rsb(res); 168 unlock_rsb(res);
160 return 0; 169 return rv;
161} 170}
162 171
163static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r) 172static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
173 struct dlm_rsb *r)
164{ 174{
165 unsigned int waiting = 0; 175 u64 xid = 0;
166 uint64_t xid = 0; 176 u64 us;
177 int rv;
167 178
168 if (lkb->lkb_flags & DLM_IFL_USER) { 179 if (lkb->lkb_flags & DLM_IFL_USER) {
169 if (lkb->lkb_ua) 180 if (lkb->lkb_ua)
170 xid = lkb->lkb_ua->xid; 181 xid = lkb->lkb_ua->xid;
171 } 182 }
172 183
173 if (lkb->lkb_timestamp) 184 /* microseconds since lkb was added to current queue */
174 waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp); 185 us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
175 186
176 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms 187 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
177 r_nodeid r_len r_name */ 188 r_nodeid r_len r_name */
178 189
179 seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n", 190 rv = seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
180 lkb->lkb_id, 191 lkb->lkb_id,
181 lkb->lkb_nodeid, 192 lkb->lkb_nodeid,
182 lkb->lkb_remid, 193 lkb->lkb_remid,
183 lkb->lkb_ownpid, 194 lkb->lkb_ownpid,
184 (unsigned long long)xid, 195 (unsigned long long)xid,
185 lkb->lkb_exflags, 196 lkb->lkb_exflags,
186 lkb->lkb_flags, 197 lkb->lkb_flags,
187 lkb->lkb_status, 198 lkb->lkb_status,
188 lkb->lkb_grmode, 199 lkb->lkb_grmode,
189 lkb->lkb_rqmode, 200 lkb->lkb_rqmode,
190 waiting, 201 (unsigned long long)us,
191 r->res_nodeid, 202 r->res_nodeid,
192 r->res_length, 203 r->res_length,
193 r->res_name); 204 r->res_name);
205 return rv;
194} 206}
195 207
196static int print_locks(struct dlm_rsb *r, struct seq_file *s) 208static int print_format2(struct dlm_rsb *r, struct seq_file *s)
197{ 209{
198 struct dlm_lkb *lkb; 210 struct dlm_lkb *lkb;
211 int rv = 0;
199 212
200 lock_rsb(r); 213 lock_rsb(r);
201 214
202 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) 215 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
203 print_lock(s, lkb, r); 216 rv = print_format2_lock(s, lkb, r);
204 217 if (rv)
205 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) 218 goto out;
206 print_lock(s, lkb, r); 219 }
207 220
208 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) 221 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
209 print_lock(s, lkb, r); 222 rv = print_format2_lock(s, lkb, r);
223 if (rv)
224 goto out;
225 }
210 226
227 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
228 rv = print_format2_lock(s, lkb, r);
229 if (rv)
230 goto out;
231 }
232 out:
211 unlock_rsb(r); 233 unlock_rsb(r);
212 return 0; 234 return rv;
213} 235}
214 236
215static int rsb_iter_next(struct rsb_iter *ri) 237static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
238 int rsb_lookup)
216{ 239{
217 struct dlm_ls *ls = ri->ls; 240 u64 xid = 0;
218 int i; 241 int rv;
219
220 if (!ri->next) {
221 top:
222 /* Find the next non-empty hash bucket */
223 for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
224 read_lock(&ls->ls_rsbtbl[i].lock);
225 if (!list_empty(&ls->ls_rsbtbl[i].list)) {
226 ri->next = ls->ls_rsbtbl[i].list.next;
227 ri->rsb = list_entry(ri->next, struct dlm_rsb,
228 res_hashchain);
229 dlm_hold_rsb(ri->rsb);
230 read_unlock(&ls->ls_rsbtbl[i].lock);
231 break;
232 }
233 read_unlock(&ls->ls_rsbtbl[i].lock);
234 }
235 ri->entry = i;
236
237 if (ri->entry >= ls->ls_rsbtbl_size)
238 return 1;
239 } else {
240 struct dlm_rsb *old = ri->rsb;
241 i = ri->entry;
242 read_lock(&ls->ls_rsbtbl[i].lock);
243 ri->next = ri->next->next;
244 if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
245 /* End of list - move to next bucket */
246 ri->next = NULL;
247 ri->entry++;
248 read_unlock(&ls->ls_rsbtbl[i].lock);
249 dlm_put_rsb(old);
250 goto top;
251 }
252 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
253 dlm_hold_rsb(ri->rsb);
254 read_unlock(&ls->ls_rsbtbl[i].lock);
255 dlm_put_rsb(old);
256 }
257 242
258 return 0; 243 if (lkb->lkb_flags & DLM_IFL_USER) {
259} 244 if (lkb->lkb_ua)
245 xid = lkb->lkb_ua->xid;
246 }
260 247
261static void rsb_iter_free(struct rsb_iter *ri) 248 rv = seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
262{ 249 lkb->lkb_id,
263 kfree(ri); 250 lkb->lkb_nodeid,
251 lkb->lkb_remid,
252 lkb->lkb_ownpid,
253 (unsigned long long)xid,
254 lkb->lkb_exflags,
255 lkb->lkb_flags,
256 lkb->lkb_status,
257 lkb->lkb_grmode,
258 lkb->lkb_rqmode,
259 lkb->lkb_highbast,
260 rsb_lookup,
261 lkb->lkb_wait_type,
262 lkb->lkb_lvbseq,
263 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
264 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
265 return rv;
264} 266}
265 267
266static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls) 268static int print_format3(struct dlm_rsb *r, struct seq_file *s)
267{ 269{
268 struct rsb_iter *ri; 270 struct dlm_lkb *lkb;
271 int i, lvblen = r->res_ls->ls_lvblen;
272 int print_name = 1;
273 int rv;
269 274
270 ri = kzalloc(sizeof *ri, GFP_KERNEL); 275 lock_rsb(r);
271 if (!ri)
272 return NULL;
273 276
274 ri->ls = ls; 277 rv = seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
275 ri->entry = 0; 278 r,
276 ri->next = NULL; 279 r->res_nodeid,
280 r->res_first_lkid,
281 r->res_flags,
282 !list_empty(&r->res_root_list),
283 !list_empty(&r->res_recover_list),
284 r->res_recover_locks_count,
285 r->res_length);
286 if (rv)
287 goto out;
277 288
278 if (rsb_iter_next(ri)) { 289 for (i = 0; i < r->res_length; i++) {
279 rsb_iter_free(ri); 290 if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
280 return NULL; 291 print_name = 0;
281 } 292 }
282 293
283 return ri; 294 seq_printf(s, "%s", print_name ? "str " : "hex");
284}
285 295
286static void *rsb_seq_start(struct seq_file *file, loff_t *pos) 296 for (i = 0; i < r->res_length; i++) {
287{ 297 if (print_name)
288 struct rsb_iter *ri; 298 seq_printf(s, "%c", r->res_name[i]);
289 loff_t n = *pos; 299 else
300 seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
301 }
302 rv = seq_printf(s, "\n");
303 if (rv)
304 goto out;
290 305
291 ri = rsb_iter_init(file->private); 306 if (!r->res_lvbptr)
292 if (!ri) 307 goto do_locks;
293 return NULL;
294 308
295 while (n--) { 309 seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
296 if (rsb_iter_next(ri)) {
297 rsb_iter_free(ri);
298 return NULL;
299 }
300 }
301 310
302 return ri; 311 for (i = 0; i < lvblen; i++)
303} 312 seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
313 rv = seq_printf(s, "\n");
314 if (rv)
315 goto out;
304 316
305static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos) 317 do_locks:
306{ 318 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
307 struct rsb_iter *ri = iter_ptr; 319 rv = print_format3_lock(s, lkb, 0);
320 if (rv)
321 goto out;
322 }
308 323
309 (*pos)++; 324 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
325 rv = print_format3_lock(s, lkb, 0);
326 if (rv)
327 goto out;
328 }
310 329
311 if (rsb_iter_next(ri)) { 330 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
312 rsb_iter_free(ri); 331 rv = print_format3_lock(s, lkb, 0);
313 return NULL; 332 if (rv)
333 goto out;
314 } 334 }
315 335
316 return ri; 336 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) {
337 rv = print_format3_lock(s, lkb, 1);
338 if (rv)
339 goto out;
340 }
341 out:
342 unlock_rsb(r);
343 return rv;
317} 344}
318 345
319static void rsb_seq_stop(struct seq_file *file, void *iter_ptr) 346struct rsbtbl_iter {
320{ 347 struct dlm_rsb *rsb;
321 /* nothing for now */ 348 unsigned bucket;
322} 349 int format;
350 int header;
351};
323 352
324static int rsb_seq_show(struct seq_file *file, void *iter_ptr) 353/* seq_printf returns -1 if the buffer is full, and 0 otherwise.
325{ 354 If the buffer is full, seq_printf can be called again, but it
326 struct rsb_iter *ri = iter_ptr; 355 does nothing and just returns -1. So, the these printing routines
356 periodically check the return value to avoid wasting too much time
357 trying to print to a full buffer. */
327 358
328 if (ri->locks) { 359static int table_seq_show(struct seq_file *seq, void *iter_ptr)
360{
361 struct rsbtbl_iter *ri = iter_ptr;
362 int rv = 0;
363
364 switch (ri->format) {
365 case 1:
366 rv = print_format1(ri->rsb, seq);
367 break;
368 case 2:
329 if (ri->header) { 369 if (ri->header) {
330 seq_printf(file, "id nodeid remid pid xid exflags flags " 370 seq_printf(seq, "id nodeid remid pid xid exflags "
331 "sts grmode rqmode time_ms r_nodeid " 371 "flags sts grmode rqmode time_ms "
332 "r_len r_name\n"); 372 "r_nodeid r_len r_name\n");
333 ri->header = 0; 373 ri->header = 0;
334 } 374 }
335 print_locks(ri->rsb, file); 375 rv = print_format2(ri->rsb, seq);
336 } else { 376 break;
337 print_resource(ri->rsb, file); 377 case 3:
378 if (ri->header) {
379 seq_printf(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
380 ri->header = 0;
381 }
382 rv = print_format3(ri->rsb, seq);
383 break;
338 } 384 }
339 385
340 return 0; 386 return rv;
341} 387}
342 388
343static struct seq_operations rsb_seq_ops = { 389static struct seq_operations format1_seq_ops;
344 .start = rsb_seq_start, 390static struct seq_operations format2_seq_ops;
345 .next = rsb_seq_next, 391static struct seq_operations format3_seq_ops;
346 .stop = rsb_seq_stop,
347 .show = rsb_seq_show,
348};
349 392
350static int rsb_open(struct inode *inode, struct file *file) 393static void *table_seq_start(struct seq_file *seq, loff_t *pos)
351{ 394{
352 struct seq_file *seq; 395 struct dlm_ls *ls = seq->private;
353 int ret; 396 struct rsbtbl_iter *ri;
354 397 struct dlm_rsb *r;
355 ret = seq_open(file, &rsb_seq_ops); 398 loff_t n = *pos;
356 if (ret) 399 unsigned bucket, entry;
357 return ret;
358
359 seq = file->private_data;
360 seq->private = inode->i_private;
361
362 return 0;
363}
364
365static const struct file_operations rsb_fops = {
366 .owner = THIS_MODULE,
367 .open = rsb_open,
368 .read = seq_read,
369 .llseek = seq_lseek,
370 .release = seq_release
371};
372 400
373/* 401 bucket = n >> 32;
374 * Dump state in compact per-lock listing 402 entry = n & ((1LL << 32) - 1);
375 */
376 403
377static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos) 404 if (bucket >= ls->ls_rsbtbl_size)
378{ 405 return NULL;
379 struct rsb_iter *ri;
380 406
381 ri = kzalloc(sizeof *ri, GFP_KERNEL); 407 ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
382 if (!ri) 408 if (!ri)
383 return NULL; 409 return NULL;
410 if (n == 0)
411 ri->header = 1;
412 if (seq->op == &format1_seq_ops)
413 ri->format = 1;
414 if (seq->op == &format2_seq_ops)
415 ri->format = 2;
416 if (seq->op == &format3_seq_ops)
417 ri->format = 3;
418
419 spin_lock(&ls->ls_rsbtbl[bucket].lock);
420 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
421 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
422 res_hashchain) {
423 if (!entry--) {
424 dlm_hold_rsb(r);
425 ri->rsb = r;
426 ri->bucket = bucket;
427 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
428 return ri;
429 }
430 }
431 }
432 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
384 433
385 ri->ls = ls; 434 /*
386 ri->entry = 0; 435 * move to the first rsb in the next non-empty bucket
387 ri->next = NULL; 436 */
388 ri->locks = 1;
389 437
390 if (*pos == 0) 438 /* zero the entry */
391 ri->header = 1; 439 n &= ~((1LL << 32) - 1);
392 440
393 if (rsb_iter_next(ri)) { 441 while (1) {
394 rsb_iter_free(ri); 442 bucket++;
395 return NULL; 443 n += 1LL << 32;
396 } 444
445 if (bucket >= ls->ls_rsbtbl_size) {
446 kfree(ri);
447 return NULL;
448 }
397 449
398 return ri; 450 spin_lock(&ls->ls_rsbtbl[bucket].lock);
451 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
452 r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
453 struct dlm_rsb, res_hashchain);
454 dlm_hold_rsb(r);
455 ri->rsb = r;
456 ri->bucket = bucket;
457 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
458 *pos = n;
459 return ri;
460 }
461 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
462 }
399} 463}
400 464
401static void *locks_seq_start(struct seq_file *file, loff_t *pos) 465static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
402{ 466{
403 struct rsb_iter *ri; 467 struct dlm_ls *ls = seq->private;
468 struct rsbtbl_iter *ri = iter_ptr;
469 struct list_head *next;
470 struct dlm_rsb *r, *rp;
404 loff_t n = *pos; 471 loff_t n = *pos;
472 unsigned bucket;
473
474 bucket = n >> 32;
475
476 /*
477 * move to the next rsb in the same bucket
478 */
479
480 spin_lock(&ls->ls_rsbtbl[bucket].lock);
481 rp = ri->rsb;
482 next = rp->res_hashchain.next;
483
484 if (next != &ls->ls_rsbtbl[bucket].list) {
485 r = list_entry(next, struct dlm_rsb, res_hashchain);
486 dlm_hold_rsb(r);
487 ri->rsb = r;
488 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
489 dlm_put_rsb(rp);
490 ++*pos;
491 return ri;
492 }
493 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
494 dlm_put_rsb(rp);
405 495
406 ri = locks_iter_init(file->private, pos); 496 /*
407 if (!ri) 497 * move to the first rsb in the next non-empty bucket
408 return NULL; 498 */
499
500 /* zero the entry */
501 n &= ~((1LL << 32) - 1);
409 502
410 while (n--) { 503 while (1) {
411 if (rsb_iter_next(ri)) { 504 bucket++;
412 rsb_iter_free(ri); 505 n += 1LL << 32;
506
507 if (bucket >= ls->ls_rsbtbl_size) {
508 kfree(ri);
413 return NULL; 509 return NULL;
414 } 510 }
511
512 spin_lock(&ls->ls_rsbtbl[bucket].lock);
513 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
514 r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
515 struct dlm_rsb, res_hashchain);
516 dlm_hold_rsb(r);
517 ri->rsb = r;
518 ri->bucket = bucket;
519 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
520 *pos = n;
521 return ri;
522 }
523 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
415 } 524 }
525}
526
527static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
528{
529 struct rsbtbl_iter *ri = iter_ptr;
416 530
417 return ri; 531 if (ri) {
532 dlm_put_rsb(ri->rsb);
533 kfree(ri);
534 }
418} 535}
419 536
420static struct seq_operations locks_seq_ops = { 537static struct seq_operations format1_seq_ops = {
421 .start = locks_seq_start, 538 .start = table_seq_start,
422 .next = rsb_seq_next, 539 .next = table_seq_next,
423 .stop = rsb_seq_stop, 540 .stop = table_seq_stop,
424 .show = rsb_seq_show, 541 .show = table_seq_show,
542};
543
544static struct seq_operations format2_seq_ops = {
545 .start = table_seq_start,
546 .next = table_seq_next,
547 .stop = table_seq_stop,
548 .show = table_seq_show,
425}; 549};
426 550
427static int locks_open(struct inode *inode, struct file *file) 551static struct seq_operations format3_seq_ops = {
552 .start = table_seq_start,
553 .next = table_seq_next,
554 .stop = table_seq_stop,
555 .show = table_seq_show,
556};
557
558static const struct file_operations format1_fops;
559static const struct file_operations format2_fops;
560static const struct file_operations format3_fops;
561
562static int table_open(struct inode *inode, struct file *file)
428{ 563{
429 struct seq_file *seq; 564 struct seq_file *seq;
430 int ret; 565 int ret = -1;
566
567 if (file->f_op == &format1_fops)
568 ret = seq_open(file, &format1_seq_ops);
569 else if (file->f_op == &format2_fops)
570 ret = seq_open(file, &format2_seq_ops);
571 else if (file->f_op == &format3_fops)
572 ret = seq_open(file, &format3_seq_ops);
431 573
432 ret = seq_open(file, &locks_seq_ops);
433 if (ret) 574 if (ret)
434 return ret; 575 return ret;
435 576
436 seq = file->private_data; 577 seq = file->private_data;
437 seq->private = inode->i_private; 578 seq->private = inode->i_private; /* the dlm_ls */
438
439 return 0; 579 return 0;
440} 580}
441 581
442static const struct file_operations locks_fops = { 582static const struct file_operations format1_fops = {
443 .owner = THIS_MODULE, 583 .owner = THIS_MODULE,
444 .open = locks_open, 584 .open = table_open,
585 .read = seq_read,
586 .llseek = seq_lseek,
587 .release = seq_release
588};
589
590static const struct file_operations format2_fops = {
591 .owner = THIS_MODULE,
592 .open = table_open,
593 .read = seq_read,
594 .llseek = seq_lseek,
595 .release = seq_release
596};
597
598static const struct file_operations format3_fops = {
599 .owner = THIS_MODULE,
600 .open = table_open,
445 .read = seq_read, 601 .read = seq_read,
446 .llseek = seq_lseek, 602 .llseek = seq_lseek,
447 .release = seq_release 603 .release = seq_release
@@ -489,30 +645,33 @@ static const struct file_operations waiters_fops = {
489 .read = waiters_read 645 .read = waiters_read
490}; 646};
491 647
648void dlm_delete_debug_file(struct dlm_ls *ls)
649{
650 if (ls->ls_debug_rsb_dentry)
651 debugfs_remove(ls->ls_debug_rsb_dentry);
652 if (ls->ls_debug_waiters_dentry)
653 debugfs_remove(ls->ls_debug_waiters_dentry);
654 if (ls->ls_debug_locks_dentry)
655 debugfs_remove(ls->ls_debug_locks_dentry);
656 if (ls->ls_debug_all_dentry)
657 debugfs_remove(ls->ls_debug_all_dentry);
658}
659
492int dlm_create_debug_file(struct dlm_ls *ls) 660int dlm_create_debug_file(struct dlm_ls *ls)
493{ 661{
494 char name[DLM_LOCKSPACE_LEN+8]; 662 char name[DLM_LOCKSPACE_LEN+8];
495 663
664 /* format 1 */
665
496 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, 666 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
497 S_IFREG | S_IRUGO, 667 S_IFREG | S_IRUGO,
498 dlm_root, 668 dlm_root,
499 ls, 669 ls,
500 &rsb_fops); 670 &format1_fops);
501 if (!ls->ls_debug_rsb_dentry) 671 if (!ls->ls_debug_rsb_dentry)
502 return -ENOMEM; 672 goto fail;
503 673
504 memset(name, 0, sizeof(name)); 674 /* format 2 */
505 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
506
507 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
508 S_IFREG | S_IRUGO,
509 dlm_root,
510 ls,
511 &waiters_fops);
512 if (!ls->ls_debug_waiters_dentry) {
513 debugfs_remove(ls->ls_debug_rsb_dentry);
514 return -ENOMEM;
515 }
516 675
517 memset(name, 0, sizeof(name)); 676 memset(name, 0, sizeof(name));
518 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); 677 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
@@ -521,24 +680,39 @@ int dlm_create_debug_file(struct dlm_ls *ls)
521 S_IFREG | S_IRUGO, 680 S_IFREG | S_IRUGO,
522 dlm_root, 681 dlm_root,
523 ls, 682 ls,
524 &locks_fops); 683 &format2_fops);
525 if (!ls->ls_debug_locks_dentry) { 684 if (!ls->ls_debug_locks_dentry)
526 debugfs_remove(ls->ls_debug_waiters_dentry); 685 goto fail;
527 debugfs_remove(ls->ls_debug_rsb_dentry); 686
528 return -ENOMEM; 687 /* format 3 */
529 } 688
689 memset(name, 0, sizeof(name));
690 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
691
692 ls->ls_debug_all_dentry = debugfs_create_file(name,
693 S_IFREG | S_IRUGO,
694 dlm_root,
695 ls,
696 &format3_fops);
697 if (!ls->ls_debug_all_dentry)
698 goto fail;
699
700 memset(name, 0, sizeof(name));
701 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
702
703 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
704 S_IFREG | S_IRUGO,
705 dlm_root,
706 ls,
707 &waiters_fops);
708 if (!ls->ls_debug_waiters_dentry)
709 goto fail;
530 710
531 return 0; 711 return 0;
532}
533 712
534void dlm_delete_debug_file(struct dlm_ls *ls) 713 fail:
535{ 714 dlm_delete_debug_file(ls);
536 if (ls->ls_debug_rsb_dentry) 715 return -ENOMEM;
537 debugfs_remove(ls->ls_debug_rsb_dentry);
538 if (ls->ls_debug_waiters_dentry)
539 debugfs_remove(ls->ls_debug_waiters_dentry);
540 if (ls->ls_debug_locks_dentry)
541 debugfs_remove(ls->ls_debug_locks_dentry);
542} 716}
543 717
544int __init dlm_register_debugfs(void) 718int __init dlm_register_debugfs(void)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 85defeb64df4..92969f879a17 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
374 struct list_head *list; 374 struct list_head *list;
375 struct dlm_rsb *r; 375 struct dlm_rsb *r;
376 int offset = 0, dir_nodeid; 376 int offset = 0, dir_nodeid;
377 uint16_t be_namelen; 377 __be16 be_namelen;
378 378
379 down_read(&ls->ls_root_sem); 379 down_read(&ls->ls_root_sem);
380 380
@@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
410 410
411 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { 411 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
412 /* Write end-of-block record */ 412 /* Write end-of-block record */
413 be_namelen = 0; 413 be_namelen = cpu_to_be16(0);
414 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 414 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
415 offset += sizeof(uint16_t); 415 offset += sizeof(__be16);
416 goto out; 416 goto out;
417 } 417 }
418 418
419 be_namelen = cpu_to_be16(r->res_length); 419 be_namelen = cpu_to_be16(r->res_length);
420 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 420 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
421 offset += sizeof(uint16_t); 421 offset += sizeof(__be16);
422 memcpy(outbuf + offset, r->res_name, r->res_length); 422 memcpy(outbuf + offset, r->res_name, r->res_length);
423 offset += r->res_length; 423 offset += r->res_length;
424 } 424 }
@@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
430 430
431 if ((list == &ls->ls_root_list) && 431 if ((list == &ls->ls_root_list) &&
432 (offset + sizeof(uint16_t) <= outlen)) { 432 (offset + sizeof(uint16_t) <= outlen)) {
433 be_namelen = 0xFFFF; 433 be_namelen = cpu_to_be16(0xFFFF);
434 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 434 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
435 offset += sizeof(uint16_t); 435 offset += sizeof(__be16);
436 } 436 }
437 437
438 out: 438 out:
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 868e4c9ef127..076e86f38bc8 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -105,7 +105,7 @@ struct dlm_dirtable {
105struct dlm_rsbtable { 105struct dlm_rsbtable {
106 struct list_head list; 106 struct list_head list;
107 struct list_head toss; 107 struct list_head toss;
108 rwlock_t lock; 108 spinlock_t lock;
109}; 109};
110 110
111struct dlm_lkbtable { 111struct dlm_lkbtable {
@@ -245,7 +245,8 @@ struct dlm_lkb {
245 struct list_head lkb_astqueue; /* need ast to be sent */ 245 struct list_head lkb_astqueue; /* need ast to be sent */
246 struct list_head lkb_ownqueue; /* list of locks for a process */ 246 struct list_head lkb_ownqueue; /* list of locks for a process */
247 struct list_head lkb_time_list; 247 struct list_head lkb_time_list;
248 unsigned long lkb_timestamp; 248 ktime_t lkb_time_bast; /* for debugging */
249 ktime_t lkb_timestamp;
249 unsigned long lkb_timeout_cs; 250 unsigned long lkb_timeout_cs;
250 251
251 char *lkb_lvbptr; 252 char *lkb_lvbptr;
@@ -481,6 +482,7 @@ struct dlm_ls {
481 struct dentry *ls_debug_rsb_dentry; /* debugfs */ 482 struct dentry *ls_debug_rsb_dentry; /* debugfs */
482 struct dentry *ls_debug_waiters_dentry; /* debugfs */ 483 struct dentry *ls_debug_waiters_dentry; /* debugfs */
483 struct dentry *ls_debug_locks_dentry; /* debugfs */ 484 struct dentry *ls_debug_locks_dentry; /* debugfs */
485 struct dentry *ls_debug_all_dentry; /* debugfs */
484 486
485 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ 487 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
486 int ls_uevent_result; 488 int ls_uevent_result;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 724ddac91538..01e7d39c5fba 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
307 lkb->lkb_lksb->sb_status = rv; 307 lkb->lkb_lksb->sb_status = rv;
308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; 308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
309 309
310 dlm_add_ast(lkb, AST_COMP); 310 dlm_add_ast(lkb, AST_COMP, 0);
311} 311}
312 312
313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
318 318
319static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 319static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
320{ 320{
321 lkb->lkb_time_bast = ktime_get();
322
321 if (is_master_copy(lkb)) 323 if (is_master_copy(lkb))
322 send_bast(r, lkb, rqmode); 324 send_bast(r, lkb, rqmode);
323 else { 325 else
324 lkb->lkb_bastmode = rqmode; 326 dlm_add_ast(lkb, AST_BAST, rqmode);
325 dlm_add_ast(lkb, AST_BAST);
326 }
327} 327}
328 328
329/* 329/*
@@ -412,9 +412,9 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
412 unsigned int flags, struct dlm_rsb **r_ret) 412 unsigned int flags, struct dlm_rsb **r_ret)
413{ 413{
414 int error; 414 int error;
415 write_lock(&ls->ls_rsbtbl[b].lock); 415 spin_lock(&ls->ls_rsbtbl[b].lock);
416 error = _search_rsb(ls, name, len, b, flags, r_ret); 416 error = _search_rsb(ls, name, len, b, flags, r_ret);
417 write_unlock(&ls->ls_rsbtbl[b].lock); 417 spin_unlock(&ls->ls_rsbtbl[b].lock);
418 return error; 418 return error;
419} 419}
420 420
@@ -478,16 +478,16 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
478 r->res_nodeid = nodeid; 478 r->res_nodeid = nodeid;
479 } 479 }
480 480
481 write_lock(&ls->ls_rsbtbl[bucket].lock); 481 spin_lock(&ls->ls_rsbtbl[bucket].lock);
482 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp); 482 error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
483 if (!error) { 483 if (!error) {
484 write_unlock(&ls->ls_rsbtbl[bucket].lock); 484 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
485 dlm_free_rsb(r); 485 dlm_free_rsb(r);
486 r = tmp; 486 r = tmp;
487 goto out; 487 goto out;
488 } 488 }
489 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); 489 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
490 write_unlock(&ls->ls_rsbtbl[bucket].lock); 490 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
491 error = 0; 491 error = 0;
492 out: 492 out:
493 *r_ret = r; 493 *r_ret = r;
@@ -530,9 +530,9 @@ static void put_rsb(struct dlm_rsb *r)
530 struct dlm_ls *ls = r->res_ls; 530 struct dlm_ls *ls = r->res_ls;
531 uint32_t bucket = r->res_bucket; 531 uint32_t bucket = r->res_bucket;
532 532
533 write_lock(&ls->ls_rsbtbl[bucket].lock); 533 spin_lock(&ls->ls_rsbtbl[bucket].lock);
534 kref_put(&r->res_ref, toss_rsb); 534 kref_put(&r->res_ref, toss_rsb);
535 write_unlock(&ls->ls_rsbtbl[bucket].lock); 535 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
536} 536}
537 537
538void dlm_put_rsb(struct dlm_rsb *r) 538void dlm_put_rsb(struct dlm_rsb *r)
@@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
744 744
745 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); 745 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
746 746
747 lkb->lkb_timestamp = ktime_get();
748
747 lkb->lkb_status = status; 749 lkb->lkb_status = status;
748 750
749 switch (status) { 751 switch (status) {
@@ -965,7 +967,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
965 967
966 for (;;) { 968 for (;;) {
967 found = 0; 969 found = 0;
968 write_lock(&ls->ls_rsbtbl[b].lock); 970 spin_lock(&ls->ls_rsbtbl[b].lock);
969 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, 971 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
970 res_hashchain) { 972 res_hashchain) {
971 if (!time_after_eq(jiffies, r->res_toss_time + 973 if (!time_after_eq(jiffies, r->res_toss_time +
@@ -976,20 +978,20 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
976 } 978 }
977 979
978 if (!found) { 980 if (!found) {
979 write_unlock(&ls->ls_rsbtbl[b].lock); 981 spin_unlock(&ls->ls_rsbtbl[b].lock);
980 break; 982 break;
981 } 983 }
982 984
983 if (kref_put(&r->res_ref, kill_rsb)) { 985 if (kref_put(&r->res_ref, kill_rsb)) {
984 list_del(&r->res_hashchain); 986 list_del(&r->res_hashchain);
985 write_unlock(&ls->ls_rsbtbl[b].lock); 987 spin_unlock(&ls->ls_rsbtbl[b].lock);
986 988
987 if (is_master(r)) 989 if (is_master(r))
988 dir_remove(r); 990 dir_remove(r);
989 dlm_free_rsb(r); 991 dlm_free_rsb(r);
990 count++; 992 count++;
991 } else { 993 } else {
992 write_unlock(&ls->ls_rsbtbl[b].lock); 994 spin_unlock(&ls->ls_rsbtbl[b].lock);
993 log_error(ls, "tossed rsb in use %s", r->res_name); 995 log_error(ls, "tossed rsb in use %s", r->res_name);
994 } 996 }
995 } 997 }
@@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb)
1013{ 1015{
1014 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 1016 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1015 1017
1016 if (is_master_copy(lkb)) { 1018 if (is_master_copy(lkb))
1017 lkb->lkb_timestamp = jiffies;
1018 return; 1019 return;
1019 }
1020 1020
1021 if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && 1021 if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
1022 !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { 1022 !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
@@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb)
1031 DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); 1031 DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
1032 mutex_lock(&ls->ls_timeout_mutex); 1032 mutex_lock(&ls->ls_timeout_mutex);
1033 hold_lkb(lkb); 1033 hold_lkb(lkb);
1034 lkb->lkb_timestamp = jiffies;
1035 list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); 1034 list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
1036 mutex_unlock(&ls->ls_timeout_mutex); 1035 mutex_unlock(&ls->ls_timeout_mutex);
1037} 1036}
@@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1059 struct dlm_rsb *r; 1058 struct dlm_rsb *r;
1060 struct dlm_lkb *lkb; 1059 struct dlm_lkb *lkb;
1061 int do_cancel, do_warn; 1060 int do_cancel, do_warn;
1061 s64 wait_us;
1062 1062
1063 for (;;) { 1063 for (;;) {
1064 if (dlm_locking_stopped(ls)) 1064 if (dlm_locking_stopped(ls))
@@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1069 mutex_lock(&ls->ls_timeout_mutex); 1069 mutex_lock(&ls->ls_timeout_mutex);
1070 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { 1070 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
1071 1071
1072 wait_us = ktime_to_us(ktime_sub(ktime_get(),
1073 lkb->lkb_timestamp));
1074
1072 if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && 1075 if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
1073 time_after_eq(jiffies, lkb->lkb_timestamp + 1076 wait_us >= (lkb->lkb_timeout_cs * 10000))
1074 lkb->lkb_timeout_cs * HZ/100))
1075 do_cancel = 1; 1077 do_cancel = 1;
1076 1078
1077 if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && 1079 if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
1078 time_after_eq(jiffies, lkb->lkb_timestamp + 1080 wait_us >= dlm_config.ci_timewarn_cs * 10000)
1079 dlm_config.ci_timewarn_cs * HZ/100))
1080 do_warn = 1; 1081 do_warn = 1;
1081 1082
1082 if (!do_cancel && !do_warn) 1083 if (!do_cancel && !do_warn)
@@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1122void dlm_adjust_timeouts(struct dlm_ls *ls) 1123void dlm_adjust_timeouts(struct dlm_ls *ls)
1123{ 1124{
1124 struct dlm_lkb *lkb; 1125 struct dlm_lkb *lkb;
1125 long adj = jiffies - ls->ls_recover_begin; 1126 u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
1126 1127
1127 ls->ls_recover_begin = 0; 1128 ls->ls_recover_begin = 0;
1128 mutex_lock(&ls->ls_timeout_mutex); 1129 mutex_lock(&ls->ls_timeout_mutex);
1129 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) 1130 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
1130 lkb->lkb_timestamp += adj; 1131 lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
1131 mutex_unlock(&ls->ls_timeout_mutex); 1132 mutex_unlock(&ls->ls_timeout_mutex);
1132} 1133}
1133 1134
@@ -4223,7 +4224,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
4223{ 4224{
4224 struct dlm_rsb *r, *r_ret = NULL; 4225 struct dlm_rsb *r, *r_ret = NULL;
4225 4226
4226 read_lock(&ls->ls_rsbtbl[bucket].lock); 4227 spin_lock(&ls->ls_rsbtbl[bucket].lock);
4227 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) { 4228 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
4228 if (!rsb_flag(r, RSB_LOCKS_PURGED)) 4229 if (!rsb_flag(r, RSB_LOCKS_PURGED))
4229 continue; 4230 continue;
@@ -4232,7 +4233,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
4232 r_ret = r; 4233 r_ret = r;
4233 break; 4234 break;
4234 } 4235 }
4235 read_unlock(&ls->ls_rsbtbl[bucket].lock); 4236 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
4236 return r_ret; 4237 return r_ret;
4237} 4238}
4238 4239
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 8d86b7960f0d..aa32e5f02493 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -464,7 +464,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
464 for (i = 0; i < size; i++) { 464 for (i = 0; i < size; i++) {
465 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); 465 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
466 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); 466 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
467 rwlock_init(&ls->ls_rsbtbl[i].lock); 467 spin_lock_init(&ls->ls_rsbtbl[i].lock);
468 } 468 }
469 469
470 size = dlm_config.ci_lkbtbl_size; 470 size = dlm_config.ci_lkbtbl_size;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3962262f991a..103a5ebd1371 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con)
295 con->sock->sk->sk_write_space = lowcomms_write_space; 295 con->sock->sk->sk_write_space = lowcomms_write_space;
296 con->sock->sk->sk_state_change = lowcomms_state_change; 296 con->sock->sk->sk_state_change = lowcomms_state_change;
297 con->sock->sk->sk_user_data = con; 297 con->sock->sk->sk_user_data = con;
298 con->sock->sk->sk_allocation = GFP_NOFS;
298 return 0; 299 return 0;
299} 300}
300 301
@@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con)
823 len = e->len; 824 len = e->len;
824 offset = e->offset; 825 offset = e->offset;
825 spin_unlock(&con->writequeue_lock); 826 spin_unlock(&con->writequeue_lock);
826 kmap(e->page);
827 827
828 /* Send the first block off the write queue */ 828 /* Send the first block off the write queue */
829 iov[0].iov_base = page_address(e->page)+offset; 829 iov[0].iov_base = page_address(e->page)+offset;
@@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con)
854 854
855 if (e->len == 0 && e->users == 0) { 855 if (e->len == 0 && e->users == 0) {
856 list_del(&e->list); 856 list_del(&e->list);
857 kunmap(e->page);
858 free_entry(e); 857 free_entry(e);
859 } 858 }
860 spin_unlock(&con->writequeue_lock); 859 spin_unlock(&con->writequeue_lock);
@@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1203 1202
1204 if (e) { 1203 if (e) {
1205 got_one: 1204 got_one:
1206 if (users == 0)
1207 kmap(e->page);
1208 *ppc = page_address(e->page) + offset; 1205 *ppc = page_address(e->page) + offset;
1209 return e; 1206 return e;
1210 } 1207 }
@@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh)
1233 if (users) 1230 if (users)
1234 goto out; 1231 goto out;
1235 e->len = e->end - e->offset; 1232 e->len = e->end - e->offset;
1236 kunmap(e->page);
1237 spin_unlock(&con->writequeue_lock); 1233 spin_unlock(&con->writequeue_lock);
1238 1234
1239 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { 1235 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
@@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con)
1272 offset = e->offset; 1268 offset = e->offset;
1273 BUG_ON(len == 0 && e->users == 0); 1269 BUG_ON(len == 0 && e->users == 0);
1274 spin_unlock(&con->writequeue_lock); 1270 spin_unlock(&con->writequeue_lock);
1275 kmap(e->page);
1276 1271
1277 ret = 0; 1272 ret = 0;
1278 if (len) { 1273 if (len) {
@@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con)
1294 1289
1295 if (e->len == 0 && e->users == 0) { 1290 if (e->len == 0 && e->users == 0) {
1296 list_del(&e->list); 1291 list_del(&e->list);
1297 kunmap(e->page);
1298 free_entry(e); 1292 free_entry(e);
1299 continue; 1293 continue;
1300 } 1294 }
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 54c14c6d06cb..c1775b84ebab 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
42 p = kzalloc(ls->ls_lvblen, GFP_KERNEL); 42 p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
43 return p; 43 return p;
44} 44}
45 45
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
57 57
58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); 58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
59 59
60 r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL); 60 r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
61 return r; 61 return r;
62} 62}
63 63
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
75 lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL); 75 lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
76 return lkb; 76 return lkb;
77} 77}
78 78
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 07ac709f3ed7..f3396c622aec 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
112 ordinary messages). */ 112 ordinary messages). */
113 113
114 if (msglen > sizeof(__tmp) && p == &__tmp.p) { 114 if (msglen > sizeof(__tmp) && p == &__tmp.p) {
115 p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 115 p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
116 if (p == NULL) 116 if (p == NULL)
117 return ret; 117 return ret;
118 } 118 }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index aa2a5775a027..ccc9d62c462d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
115 data->status = lkb->lkb_status; 115 data->status = lkb->lkb_status;
116 data->grmode = lkb->lkb_grmode; 116 data->grmode = lkb->lkb_grmode;
117 data->rqmode = lkb->lkb_rqmode; 117 data->rqmode = lkb->lkb_rqmode;
118 data->timestamp = lkb->lkb_timestamp;
119 if (lkb->lkb_ua) 118 if (lkb->lkb_ua)
120 data->xid = lkb->lkb_ua->xid; 119 data->xid = lkb->lkb_ua->xid;
121 if (r) { 120 if (r) {
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index eba87ff3177b..894a32d438d5 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -168,7 +168,7 @@ static int dlm_plock_callback(struct plock_op *op)
168 notify = xop->callback; 168 notify = xop->callback;
169 169
170 if (op->info.rv) { 170 if (op->info.rv) {
171 notify(flc, NULL, op->info.rv); 171 notify(fl, NULL, op->info.rv);
172 goto out; 172 goto out;
173 } 173 }
174 174
@@ -187,7 +187,7 @@ static int dlm_plock_callback(struct plock_op *op)
187 (unsigned long long)op->info.number, file, fl); 187 (unsigned long long)op->info.number, file, fl);
188 } 188 }
189 189
190 rv = notify(flc, NULL, 0); 190 rv = notify(fl, NULL, 0);
191 if (rv) { 191 if (rv) {
192 /* XXX: We need to cancel the fs lock here: */ 192 /* XXX: We need to cancel the fs lock here: */
193 log_print("dlm_plock_callback: lock granted after lock request " 193 log_print("dlm_plock_callback: lock granted after lock request "
@@ -304,7 +304,9 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
304 if (rv == -ENOENT) 304 if (rv == -ENOENT)
305 rv = 0; 305 rv = 0;
306 else if (rv > 0) { 306 else if (rv > 0) {
307 locks_init_lock(fl);
307 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; 308 fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
309 fl->fl_flags = FL_POSIX;
308 fl->fl_pid = op->info.pid; 310 fl->fl_pid = op->info.pid;
309 fl->fl_start = op->info.start; 311 fl->fl_start = op->info.start;
310 fl->fl_end = op->info.end; 312 fl->fl_end = op->info.end;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 80aba5bdd4a4..eda43f362616 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -726,7 +726,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
726 } 726 }
727 727
728 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 728 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
729 read_lock(&ls->ls_rsbtbl[i].lock); 729 spin_lock(&ls->ls_rsbtbl[i].lock);
730 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { 730 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
731 list_add(&r->res_root_list, &ls->ls_root_list); 731 list_add(&r->res_root_list, &ls->ls_root_list);
732 dlm_hold_rsb(r); 732 dlm_hold_rsb(r);
@@ -737,7 +737,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
737 but no other recovery steps should do anything with them. */ 737 but no other recovery steps should do anything with them. */
738 738
739 if (dlm_no_directory(ls)) { 739 if (dlm_no_directory(ls)) {
740 read_unlock(&ls->ls_rsbtbl[i].lock); 740 spin_unlock(&ls->ls_rsbtbl[i].lock);
741 continue; 741 continue;
742 } 742 }
743 743
@@ -745,7 +745,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
745 list_add(&r->res_root_list, &ls->ls_root_list); 745 list_add(&r->res_root_list, &ls->ls_root_list);
746 dlm_hold_rsb(r); 746 dlm_hold_rsb(r);
747 } 747 }
748 read_unlock(&ls->ls_rsbtbl[i].lock); 748 spin_unlock(&ls->ls_rsbtbl[i].lock);
749 } 749 }
750 out: 750 out:
751 up_write(&ls->ls_root_sem); 751 up_write(&ls->ls_root_sem);
@@ -775,7 +775,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
775 int i; 775 int i;
776 776
777 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 777 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
778 write_lock(&ls->ls_rsbtbl[i].lock); 778 spin_lock(&ls->ls_rsbtbl[i].lock);
779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, 779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
780 res_hashchain) { 780 res_hashchain) {
781 if (dlm_no_directory(ls) || !is_master(r)) { 781 if (dlm_no_directory(ls) || !is_master(r)) {
@@ -783,7 +783,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
783 dlm_free_rsb(r); 783 dlm_free_rsb(r);
784 } 784 }
785 } 785 }
786 write_unlock(&ls->ls_rsbtbl[i].lock); 786 spin_unlock(&ls->ls_rsbtbl[i].lock);
787 } 787 }
788} 788}
789 789
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b3832c67194a..065149e84f42 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
175/* we could possibly check if the cancel of an orphan has resulted in the lkb 175/* we could possibly check if the cancel of an orphan has resulted in the lkb
176 being removed and then remove that lkb from the orphans list and free it */ 176 being removed and then remove that lkb from the orphans list and free it */
177 177
178void dlm_user_add_ast(struct dlm_lkb *lkb, int type) 178void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
179{ 179{
180 struct dlm_ls *ls; 180 struct dlm_ls *ls;
181 struct dlm_user_args *ua; 181 struct dlm_user_args *ua;
@@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
208 208
209 ast_type = lkb->lkb_ast_type; 209 ast_type = lkb->lkb_ast_type;
210 lkb->lkb_ast_type |= type; 210 lkb->lkb_ast_type |= type;
211 if (bastmode)
212 lkb->lkb_bastmode = bastmode;
211 213
212 if (!ast_type) { 214 if (!ast_type) {
213 kref_get(&lkb->lkb_ref); 215 kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 35eb6a13d616..1c9686492286 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,7 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 15int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/dquot.c b/fs/dquot.c
index c237ccc8581c..bca3cac4bee7 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -87,14 +87,17 @@
87#define __DQUOT_PARANOIA 87#define __DQUOT_PARANOIA
88 88
89/* 89/*
90 * There are two quota SMP locks. dq_list_lock protects all lists with quotas 90 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
91 * and quota formats and also dqstats structure containing statistics about the 91 * and quota formats, dqstats structure containing statistics about the lists
92 * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures 92 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
93 * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. 93 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
94 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly 94 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
95 * in inode_add_bytes() and inode_sub_bytes(). 95 * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects
96 * modifications of quota state (on quotaon and quotaoff) and readers who care
97 * about latest values take it as well.
96 * 98 *
97 * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock 99 * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock,
100 * dq_list_lock > dq_state_lock
98 * 101 *
99 * Note that some things (eg. sb pointer, type, id) doesn't change during 102 * Note that some things (eg. sb pointer, type, id) doesn't change during
100 * the life of the dquot structure and so needn't to be protected by a lock 103 * the life of the dquot structure and so needn't to be protected by a lock
@@ -103,12 +106,7 @@
103 * operation is just reading pointers from inode (or not using them at all) the 106 * operation is just reading pointers from inode (or not using them at all) the
104 * read lock is enough. If pointers are altered function must hold write lock 107 * read lock is enough. If pointers are altered function must hold write lock
105 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that 108 * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
106 * for altering the flag i_mutex is also needed). If operation is holding 109 * for altering the flag i_mutex is also needed).
107 * reference to dquot in other way (e.g. quotactl ops) it must be guarded by
108 * dqonoff_mutex.
109 * This locking assures that:
110 * a) update/access to dquot pointers in inode is serialized
111 * b) everyone is guarded against invalidate_dquots()
112 * 110 *
113 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced 111 * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
114 * from inodes (dquot_alloc_space() and such don't check the dq_lock). 112 * from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -122,10 +120,17 @@
122 * Lock ordering (including related VFS locks) is the following: 120 * Lock ordering (including related VFS locks) is the following:
123 * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > 121 * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock >
124 * dqio_mutex 122 * dqio_mutex
123 * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
124 * dqptr_sem. But filesystem has to count with the fact that functions such as
125 * dquot_alloc_space() acquire dqptr_sem and they usually have to be called
126 * from inside a transaction to keep filesystem consistency after a crash. Also
127 * filesystems usually want to do some IO on dquot from ->mark_dirty which is
128 * called with dqptr_sem held.
125 * i_mutex on quota files is special (it's below dqio_mutex) 129 * i_mutex on quota files is special (it's below dqio_mutex)
126 */ 130 */
127 131
128static DEFINE_SPINLOCK(dq_list_lock); 132static DEFINE_SPINLOCK(dq_list_lock);
133static DEFINE_SPINLOCK(dq_state_lock);
129DEFINE_SPINLOCK(dq_data_lock); 134DEFINE_SPINLOCK(dq_data_lock);
130 135
131static char *quotatypes[] = INITQFNAMES; 136static char *quotatypes[] = INITQFNAMES;
@@ -211,8 +216,6 @@ static struct hlist_head *dquot_hash;
211 216
212struct dqstats dqstats; 217struct dqstats dqstats;
213 218
214static void dqput(struct dquot *dquot);
215
216static inline unsigned int 219static inline unsigned int
217hashfn(const struct super_block *sb, unsigned int id, int type) 220hashfn(const struct super_block *sb, unsigned int id, int type)
218{ 221{
@@ -415,11 +418,22 @@ out_dqlock:
415 return ret; 418 return ret;
416} 419}
417 420
421void dquot_destroy(struct dquot *dquot)
422{
423 kmem_cache_free(dquot_cachep, dquot);
424}
425EXPORT_SYMBOL(dquot_destroy);
426
427static inline void do_destroy_dquot(struct dquot *dquot)
428{
429 dquot->dq_sb->dq_op->destroy_dquot(dquot);
430}
431
418/* Invalidate all dquots on the list. Note that this function is called after 432/* Invalidate all dquots on the list. Note that this function is called after
419 * quota is disabled and pointers from inodes removed so there cannot be new 433 * quota is disabled and pointers from inodes removed so there cannot be new
420 * quota users. There can still be some users of quotas due to inodes being 434 * quota users. There can still be some users of quotas due to inodes being
421 * just deleted or pruned by prune_icache() (those are not attached to any 435 * just deleted or pruned by prune_icache() (those are not attached to any
422 * list). We have to wait for such users. 436 * list) or parallel quotactl call. We have to wait for such users.
423 */ 437 */
424static void invalidate_dquots(struct super_block *sb, int type) 438static void invalidate_dquots(struct super_block *sb, int type)
425{ 439{
@@ -463,11 +477,46 @@ restart:
463 remove_dquot_hash(dquot); 477 remove_dquot_hash(dquot);
464 remove_free_dquot(dquot); 478 remove_free_dquot(dquot);
465 remove_inuse(dquot); 479 remove_inuse(dquot);
466 kmem_cache_free(dquot_cachep, dquot); 480 do_destroy_dquot(dquot);
467 } 481 }
468 spin_unlock(&dq_list_lock); 482 spin_unlock(&dq_list_lock);
469} 483}
470 484
485/* Call callback for every active dquot on given filesystem */
486int dquot_scan_active(struct super_block *sb,
487 int (*fn)(struct dquot *dquot, unsigned long priv),
488 unsigned long priv)
489{
490 struct dquot *dquot, *old_dquot = NULL;
491 int ret = 0;
492
493 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
494 spin_lock(&dq_list_lock);
495 list_for_each_entry(dquot, &inuse_list, dq_inuse) {
496 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
497 continue;
498 if (dquot->dq_sb != sb)
499 continue;
500 /* Now we have active dquot so we can just increase use count */
501 atomic_inc(&dquot->dq_count);
502 dqstats.lookups++;
503 spin_unlock(&dq_list_lock);
504 dqput(old_dquot);
505 old_dquot = dquot;
506 ret = fn(dquot, priv);
507 if (ret < 0)
508 goto out;
509 spin_lock(&dq_list_lock);
510 /* We are safe to continue now because our dquot could not
511 * be moved out of the inuse list while we hold the reference */
512 }
513 spin_unlock(&dq_list_lock);
514out:
515 dqput(old_dquot);
516 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
517 return ret;
518}
519
471int vfs_quota_sync(struct super_block *sb, int type) 520int vfs_quota_sync(struct super_block *sb, int type)
472{ 521{
473 struct list_head *dirty; 522 struct list_head *dirty;
@@ -479,7 +528,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
479 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 528 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
480 if (type != -1 && cnt != type) 529 if (type != -1 && cnt != type)
481 continue; 530 continue;
482 if (!sb_has_quota_enabled(sb, cnt)) 531 if (!sb_has_quota_active(sb, cnt))
483 continue; 532 continue;
484 spin_lock(&dq_list_lock); 533 spin_lock(&dq_list_lock);
485 dirty = &dqopt->info[cnt].dqi_dirty_list; 534 dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +553,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
504 } 553 }
505 554
506 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 555 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
507 if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt) 556 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
508 && info_dirty(&dqopt->info[cnt])) 557 && info_dirty(&dqopt->info[cnt]))
509 sb->dq_op->write_info(sb, cnt); 558 sb->dq_op->write_info(sb, cnt);
510 spin_lock(&dq_list_lock); 559 spin_lock(&dq_list_lock);
511 dqstats.syncs++; 560 dqstats.syncs++;
@@ -527,7 +576,7 @@ static void prune_dqcache(int count)
527 remove_dquot_hash(dquot); 576 remove_dquot_hash(dquot);
528 remove_free_dquot(dquot); 577 remove_free_dquot(dquot);
529 remove_inuse(dquot); 578 remove_inuse(dquot);
530 kmem_cache_free(dquot_cachep, dquot); 579 do_destroy_dquot(dquot);
531 count--; 580 count--;
532 head = free_dquots.prev; 581 head = free_dquots.prev;
533 } 582 }
@@ -556,9 +605,8 @@ static struct shrinker dqcache_shrinker = {
556/* 605/*
557 * Put reference to dquot 606 * Put reference to dquot
558 * NOTE: If you change this function please check whether dqput_blocks() works right... 607 * NOTE: If you change this function please check whether dqput_blocks() works right...
559 * MUST be called with either dqptr_sem or dqonoff_mutex held
560 */ 608 */
561static void dqput(struct dquot *dquot) 609void dqput(struct dquot *dquot)
562{ 610{
563 int ret; 611 int ret;
564 612
@@ -584,7 +632,7 @@ we_slept:
584 /* We have more than one user... nothing to do */ 632 /* We have more than one user... nothing to do */
585 atomic_dec(&dquot->dq_count); 633 atomic_dec(&dquot->dq_count);
586 /* Releasing dquot during quotaoff phase? */ 634 /* Releasing dquot during quotaoff phase? */
587 if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) && 635 if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
588 atomic_read(&dquot->dq_count) == 1) 636 atomic_read(&dquot->dq_count) == 1)
589 wake_up(&dquot->dq_wait_unused); 637 wake_up(&dquot->dq_wait_unused);
590 spin_unlock(&dq_list_lock); 638 spin_unlock(&dq_list_lock);
@@ -625,11 +673,17 @@ we_slept:
625 spin_unlock(&dq_list_lock); 673 spin_unlock(&dq_list_lock);
626} 674}
627 675
676struct dquot *dquot_alloc(struct super_block *sb, int type)
677{
678 return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
679}
680EXPORT_SYMBOL(dquot_alloc);
681
628static struct dquot *get_empty_dquot(struct super_block *sb, int type) 682static struct dquot *get_empty_dquot(struct super_block *sb, int type)
629{ 683{
630 struct dquot *dquot; 684 struct dquot *dquot;
631 685
632 dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS); 686 dquot = sb->dq_op->alloc_dquot(sb, type);
633 if(!dquot) 687 if(!dquot)
634 return NODQUOT; 688 return NODQUOT;
635 689
@@ -648,17 +702,29 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
648 702
649/* 703/*
650 * Get reference to dquot 704 * Get reference to dquot
651 * MUST be called with either dqptr_sem or dqonoff_mutex held 705 *
706 * Locking is slightly tricky here. We are guarded from parallel quotaoff()
707 * destroying our dquot by:
708 * a) checking for quota flags under dq_list_lock and
709 * b) getting a reference to dquot before we release dq_list_lock
652 */ 710 */
653static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) 711struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
654{ 712{
655 unsigned int hashent = hashfn(sb, id, type); 713 unsigned int hashent = hashfn(sb, id, type);
656 struct dquot *dquot, *empty = NODQUOT; 714 struct dquot *dquot = NODQUOT, *empty = NODQUOT;
657 715
658 if (!sb_has_quota_enabled(sb, type)) 716 if (!sb_has_quota_active(sb, type))
659 return NODQUOT; 717 return NODQUOT;
660we_slept: 718we_slept:
661 spin_lock(&dq_list_lock); 719 spin_lock(&dq_list_lock);
720 spin_lock(&dq_state_lock);
721 if (!sb_has_quota_active(sb, type)) {
722 spin_unlock(&dq_state_lock);
723 spin_unlock(&dq_list_lock);
724 goto out;
725 }
726 spin_unlock(&dq_state_lock);
727
662 if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) { 728 if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) {
663 if (empty == NODQUOT) { 729 if (empty == NODQUOT) {
664 spin_unlock(&dq_list_lock); 730 spin_unlock(&dq_list_lock);
@@ -667,6 +733,7 @@ we_slept:
667 goto we_slept; 733 goto we_slept;
668 } 734 }
669 dquot = empty; 735 dquot = empty;
736 empty = NODQUOT;
670 dquot->dq_id = id; 737 dquot->dq_id = id;
671 /* all dquots go on the inuse_list */ 738 /* all dquots go on the inuse_list */
672 put_inuse(dquot); 739 put_inuse(dquot);
@@ -681,8 +748,6 @@ we_slept:
681 dqstats.cache_hits++; 748 dqstats.cache_hits++;
682 dqstats.lookups++; 749 dqstats.lookups++;
683 spin_unlock(&dq_list_lock); 750 spin_unlock(&dq_list_lock);
684 if (empty)
685 kmem_cache_free(dquot_cachep, empty);
686 } 751 }
687 /* Wait for dq_lock - after this we know that either dquot_release() is already 752 /* Wait for dq_lock - after this we know that either dquot_release() is already
688 * finished or it will be canceled due to dq_count > 1 test */ 753 * finished or it will be canceled due to dq_count > 1 test */
@@ -690,11 +755,15 @@ we_slept:
690 /* Read the dquot and instantiate it (everything done only if needed) */ 755 /* Read the dquot and instantiate it (everything done only if needed) */
691 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) { 756 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) {
692 dqput(dquot); 757 dqput(dquot);
693 return NODQUOT; 758 dquot = NODQUOT;
759 goto out;
694 } 760 }
695#ifdef __DQUOT_PARANOIA 761#ifdef __DQUOT_PARANOIA
696 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ 762 BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
697#endif 763#endif
764out:
765 if (empty)
766 do_destroy_dquot(empty);
698 767
699 return dquot; 768 return dquot;
700} 769}
@@ -820,7 +889,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
820 } 889 }
821} 890}
822 891
823static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) 892static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
824{ 893{
825 dquot->dq_dqb.dqb_curinodes += number; 894 dquot->dq_dqb.dqb_curinodes += number;
826} 895}
@@ -830,9 +899,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
830 dquot->dq_dqb.dqb_curspace += number; 899 dquot->dq_dqb.dqb_curspace += number;
831} 900}
832 901
833static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) 902static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
834{ 903{
835 if (dquot->dq_dqb.dqb_curinodes > number) 904 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
905 dquot->dq_dqb.dqb_curinodes >= number)
836 dquot->dq_dqb.dqb_curinodes -= number; 906 dquot->dq_dqb.dqb_curinodes -= number;
837 else 907 else
838 dquot->dq_dqb.dqb_curinodes = 0; 908 dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +913,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
843 913
844static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) 914static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
845{ 915{
846 if (dquot->dq_dqb.dqb_curspace > number) 916 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
917 dquot->dq_dqb.dqb_curspace >= number)
847 dquot->dq_dqb.dqb_curspace -= number; 918 dquot->dq_dqb.dqb_curspace -= number;
848 else 919 else
849 dquot->dq_dqb.dqb_curspace = 0; 920 dquot->dq_dqb.dqb_curspace = 0;
850 if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) 921 if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
851 dquot->dq_dqb.dqb_btime = (time_t) 0; 922 dquot->dq_dqb.dqb_btime = (time_t) 0;
852 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 923 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
853} 924}
@@ -1023,10 +1094,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
1023} 1094}
1024 1095
1025/* needs dq_data_lock */ 1096/* needs dq_data_lock */
1026static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) 1097static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1027{ 1098{
1028 *warntype = QUOTA_NL_NOWARN; 1099 *warntype = QUOTA_NL_NOWARN;
1029 if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1100 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1101 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1030 return QUOTA_OK; 1102 return QUOTA_OK;
1031 1103
1032 if (dquot->dq_dqb.dqb_ihardlimit && 1104 if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1130,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
1058static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) 1130static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
1059{ 1131{
1060 *warntype = QUOTA_NL_NOWARN; 1132 *warntype = QUOTA_NL_NOWARN;
1061 if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1133 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1134 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1062 return QUOTA_OK; 1135 return QUOTA_OK;
1063 1136
1064 if (dquot->dq_dqb.dqb_bhardlimit && 1137 if (dquot->dq_dqb.dqb_bhardlimit &&
1065 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit && 1138 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
1066 !ignore_hardlimit(dquot)) { 1139 !ignore_hardlimit(dquot)) {
1067 if (!prealloc) 1140 if (!prealloc)
1068 *warntype = QUOTA_NL_BHARDWARN; 1141 *warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1143,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1070 } 1143 }
1071 1144
1072 if (dquot->dq_dqb.dqb_bsoftlimit && 1145 if (dquot->dq_dqb.dqb_bsoftlimit &&
1073 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && 1146 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
1074 dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && 1147 dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
1075 !ignore_hardlimit(dquot)) { 1148 !ignore_hardlimit(dquot)) {
1076 if (!prealloc) 1149 if (!prealloc)
@@ -1079,7 +1152,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1079 } 1152 }
1080 1153
1081 if (dquot->dq_dqb.dqb_bsoftlimit && 1154 if (dquot->dq_dqb.dqb_bsoftlimit &&
1082 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && 1155 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
1083 dquot->dq_dqb.dqb_btime == 0) { 1156 dquot->dq_dqb.dqb_btime == 0) {
1084 if (!prealloc) { 1157 if (!prealloc) {
1085 *warntype = QUOTA_NL_BSOFTWARN; 1158 *warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1169,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1096 return QUOTA_OK; 1169 return QUOTA_OK;
1097} 1170}
1098 1171
1099static int info_idq_free(struct dquot *dquot, ulong inodes) 1172static int info_idq_free(struct dquot *dquot, qsize_t inodes)
1100{ 1173{
1101 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1174 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1102 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) 1175 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
1176 !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
1103 return QUOTA_NL_NOWARN; 1177 return QUOTA_NL_NOWARN;
1104 1178
1105 if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) 1179 if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,71 +1187,88 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
1113static int info_bdq_free(struct dquot *dquot, qsize_t space) 1187static int info_bdq_free(struct dquot *dquot, qsize_t space)
1114{ 1188{
1115 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1189 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1116 toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) 1190 dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
1117 return QUOTA_NL_NOWARN; 1191 return QUOTA_NL_NOWARN;
1118 1192
1119 if (toqb(dquot->dq_dqb.dqb_curspace - space) <= 1193 if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
1120 dquot->dq_dqb.dqb_bsoftlimit)
1121 return QUOTA_NL_BSOFTBELOW; 1194 return QUOTA_NL_BSOFTBELOW;
1122 if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && 1195 if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
1123 toqb(dquot->dq_dqb.dqb_curspace - space) < 1196 dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
1124 dquot->dq_dqb.dqb_bhardlimit)
1125 return QUOTA_NL_BHARDBELOW; 1197 return QUOTA_NL_BHARDBELOW;
1126 return QUOTA_NL_NOWARN; 1198 return QUOTA_NL_NOWARN;
1127} 1199}
1128/* 1200/*
1129 * Initialize quota pointers in inode 1201 * Initialize quota pointers in inode
1130 * Transaction must be started at entry 1202 * We do things in a bit complicated way but by that we avoid calling
1203 * dqget() and thus filesystem callbacks under dqptr_sem.
1131 */ 1204 */
1132int dquot_initialize(struct inode *inode, int type) 1205int dquot_initialize(struct inode *inode, int type)
1133{ 1206{
1134 unsigned int id = 0; 1207 unsigned int id = 0;
1135 int cnt, ret = 0; 1208 int cnt, ret = 0;
1209 struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT };
1210 struct super_block *sb = inode->i_sb;
1136 1211
1137 /* First test before acquiring mutex - solves deadlocks when we 1212 /* First test before acquiring mutex - solves deadlocks when we
1138 * re-enter the quota code and are already holding the mutex */ 1213 * re-enter the quota code and are already holding the mutex */
1139 if (IS_NOQUOTA(inode)) 1214 if (IS_NOQUOTA(inode))
1140 return 0; 1215 return 0;
1141 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1216
1217 /* First get references to structures we might need. */
1218 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1219 if (type != -1 && cnt != type)
1220 continue;
1221 switch (cnt) {
1222 case USRQUOTA:
1223 id = inode->i_uid;
1224 break;
1225 case GRPQUOTA:
1226 id = inode->i_gid;
1227 break;
1228 }
1229 got[cnt] = dqget(sb, id, cnt);
1230 }
1231
1232 down_write(&sb_dqopt(sb)->dqptr_sem);
1142 /* Having dqptr_sem we know NOQUOTA flags can't be altered... */ 1233 /* Having dqptr_sem we know NOQUOTA flags can't be altered... */
1143 if (IS_NOQUOTA(inode)) 1234 if (IS_NOQUOTA(inode))
1144 goto out_err; 1235 goto out_err;
1145 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1236 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1146 if (type != -1 && cnt != type) 1237 if (type != -1 && cnt != type)
1147 continue; 1238 continue;
1239 /* Avoid races with quotaoff() */
1240 if (!sb_has_quota_active(sb, cnt))
1241 continue;
1148 if (inode->i_dquot[cnt] == NODQUOT) { 1242 if (inode->i_dquot[cnt] == NODQUOT) {
1149 switch (cnt) { 1243 inode->i_dquot[cnt] = got[cnt];
1150 case USRQUOTA: 1244 got[cnt] = NODQUOT;
1151 id = inode->i_uid;
1152 break;
1153 case GRPQUOTA:
1154 id = inode->i_gid;
1155 break;
1156 }
1157 inode->i_dquot[cnt] = dqget(inode->i_sb, id, cnt);
1158 } 1245 }
1159 } 1246 }
1160out_err: 1247out_err:
1161 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1248 up_write(&sb_dqopt(sb)->dqptr_sem);
1249 /* Drop unused references */
1250 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1251 dqput(got[cnt]);
1162 return ret; 1252 return ret;
1163} 1253}
1164 1254
1165/* 1255/*
1166 * Release all quotas referenced by inode 1256 * Release all quotas referenced by inode
1167 * Transaction must be started at an entry
1168 */ 1257 */
1169int dquot_drop(struct inode *inode) 1258int dquot_drop(struct inode *inode)
1170{ 1259{
1171 int cnt; 1260 int cnt;
1261 struct dquot *put[MAXQUOTAS];
1172 1262
1173 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1263 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1174 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1264 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1175 if (inode->i_dquot[cnt] != NODQUOT) { 1265 put[cnt] = inode->i_dquot[cnt];
1176 dqput(inode->i_dquot[cnt]); 1266 inode->i_dquot[cnt] = NODQUOT;
1177 inode->i_dquot[cnt] = NODQUOT;
1178 }
1179 } 1267 }
1180 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1268 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1269
1270 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1271 dqput(put[cnt]);
1181 return 0; 1272 return 0;
1182} 1273}
1183 1274
@@ -1264,7 +1355,7 @@ warn_put_all:
1264/* 1355/*
1265 * This operation can block, but only after everything is updated 1356 * This operation can block, but only after everything is updated
1266 */ 1357 */
1267int dquot_alloc_inode(const struct inode *inode, unsigned long number) 1358int dquot_alloc_inode(const struct inode *inode, qsize_t number)
1268{ 1359{
1269 int cnt, ret = NO_QUOTA; 1360 int cnt, ret = NO_QUOTA;
1270 char warntype[MAXQUOTAS]; 1361 char warntype[MAXQUOTAS];
@@ -1349,7 +1440,7 @@ out_sub:
1349/* 1440/*
1350 * This operation can block, but only after everything is updated 1441 * This operation can block, but only after everything is updated
1351 */ 1442 */
1352int dquot_free_inode(const struct inode *inode, unsigned long number) 1443int dquot_free_inode(const struct inode *inode, qsize_t number)
1353{ 1444{
1354 unsigned int cnt; 1445 unsigned int cnt;
1355 char warntype[MAXQUOTAS]; 1446 char warntype[MAXQUOTAS];
@@ -1393,8 +1484,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1393 qsize_t space; 1484 qsize_t space;
1394 struct dquot *transfer_from[MAXQUOTAS]; 1485 struct dquot *transfer_from[MAXQUOTAS];
1395 struct dquot *transfer_to[MAXQUOTAS]; 1486 struct dquot *transfer_to[MAXQUOTAS];
1396 int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, 1487 int cnt, ret = QUOTA_OK;
1397 chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; 1488 int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
1489 chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
1398 char warntype_to[MAXQUOTAS]; 1490 char warntype_to[MAXQUOTAS];
1399 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1491 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
1400 1492
@@ -1402,21 +1494,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1402 * re-enter the quota code and are already holding the mutex */ 1494 * re-enter the quota code and are already holding the mutex */
1403 if (IS_NOQUOTA(inode)) 1495 if (IS_NOQUOTA(inode))
1404 return QUOTA_OK; 1496 return QUOTA_OK;
1405 /* Clear the arrays */ 1497 /* Initialize the arrays */
1406 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1498 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1407 transfer_to[cnt] = transfer_from[cnt] = NODQUOT; 1499 transfer_from[cnt] = NODQUOT;
1500 transfer_to[cnt] = NODQUOT;
1408 warntype_to[cnt] = QUOTA_NL_NOWARN; 1501 warntype_to[cnt] = QUOTA_NL_NOWARN;
1409 }
1410 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1411 /* Now recheck reliably when holding dqptr_sem */
1412 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1413 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1414 return QUOTA_OK;
1415 }
1416 /* First build the transfer_to list - here we can block on
1417 * reading/instantiating of dquots. We know that the transaction for
1418 * us was already started so we don't violate lock ranking here */
1419 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1420 switch (cnt) { 1502 switch (cnt) {
1421 case USRQUOTA: 1503 case USRQUOTA:
1422 if (!chuid) 1504 if (!chuid)
@@ -1430,6 +1512,13 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1430 break; 1512 break;
1431 } 1513 }
1432 } 1514 }
1515
1516 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1517 /* Now recheck reliably when holding dqptr_sem */
1518 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1519 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1520 goto put_all;
1521 }
1433 spin_lock(&dq_data_lock); 1522 spin_lock(&dq_data_lock);
1434 space = inode_get_bytes(inode); 1523 space = inode_get_bytes(inode);
1435 /* Build the transfer_from list and check the limits */ 1524 /* Build the transfer_from list and check the limits */
@@ -1440,7 +1529,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1440 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == 1529 if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
1441 NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, 1530 NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
1442 warntype_to + cnt) == NO_QUOTA) 1531 warntype_to + cnt) == NO_QUOTA)
1443 goto warn_put_all; 1532 goto over_quota;
1444 } 1533 }
1445 1534
1446 /* 1535 /*
@@ -1468,34 +1557,43 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1468 1557
1469 inode->i_dquot[cnt] = transfer_to[cnt]; 1558 inode->i_dquot[cnt] = transfer_to[cnt];
1470 } 1559 }
1471 ret = QUOTA_OK;
1472warn_put_all:
1473 spin_unlock(&dq_data_lock); 1560 spin_unlock(&dq_data_lock);
1561 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1562
1474 /* Dirtify all the dquots - this can block when journalling */ 1563 /* Dirtify all the dquots - this can block when journalling */
1475 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1564 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1476 if (transfer_from[cnt]) 1565 if (transfer_from[cnt])
1477 mark_dquot_dirty(transfer_from[cnt]); 1566 mark_dquot_dirty(transfer_from[cnt]);
1478 if (transfer_to[cnt]) 1567 if (transfer_to[cnt]) {
1479 mark_dquot_dirty(transfer_to[cnt]); 1568 mark_dquot_dirty(transfer_to[cnt]);
1569 /* The reference we got is transferred to the inode */
1570 transfer_to[cnt] = NODQUOT;
1571 }
1480 } 1572 }
1573warn_put_all:
1481 flush_warnings(transfer_to, warntype_to); 1574 flush_warnings(transfer_to, warntype_to);
1482 flush_warnings(transfer_from, warntype_from_inodes); 1575 flush_warnings(transfer_from, warntype_from_inodes);
1483 flush_warnings(transfer_from, warntype_from_space); 1576 flush_warnings(transfer_from, warntype_from_space);
1484 1577put_all:
1485 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1578 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1486 if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT) 1579 dqput(transfer_from[cnt]);
1487 dqput(transfer_from[cnt]); 1580 dqput(transfer_to[cnt]);
1488 if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT)
1489 dqput(transfer_to[cnt]);
1490 } 1581 }
1491 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1492 return ret; 1582 return ret;
1583over_quota:
1584 spin_unlock(&dq_data_lock);
1585 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1586 /* Clear dquot pointers we don't want to dqput() */
1587 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1588 transfer_from[cnt] = NODQUOT;
1589 ret = NO_QUOTA;
1590 goto warn_put_all;
1493} 1591}
1494 1592
1495/* Wrapper for transferring ownership of an inode */ 1593/* Wrapper for transferring ownership of an inode */
1496int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) 1594int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
1497{ 1595{
1498 if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { 1596 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
1499 vfs_dq_init(inode); 1597 vfs_dq_init(inode);
1500 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) 1598 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
1501 return 1; 1599 return 1;
@@ -1533,54 +1631,27 @@ struct dquot_operations dquot_operations = {
1533 .acquire_dquot = dquot_acquire, 1631 .acquire_dquot = dquot_acquire,
1534 .release_dquot = dquot_release, 1632 .release_dquot = dquot_release,
1535 .mark_dirty = dquot_mark_dquot_dirty, 1633 .mark_dirty = dquot_mark_dquot_dirty,
1536 .write_info = dquot_commit_info 1634 .write_info = dquot_commit_info,
1635 .alloc_dquot = dquot_alloc,
1636 .destroy_dquot = dquot_destroy,
1537}; 1637};
1538 1638
1539static inline void set_enable_flags(struct quota_info *dqopt, int type)
1540{
1541 switch (type) {
1542 case USRQUOTA:
1543 dqopt->flags |= DQUOT_USR_ENABLED;
1544 dqopt->flags &= ~DQUOT_USR_SUSPENDED;
1545 break;
1546 case GRPQUOTA:
1547 dqopt->flags |= DQUOT_GRP_ENABLED;
1548 dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
1549 break;
1550 }
1551}
1552
1553static inline void reset_enable_flags(struct quota_info *dqopt, int type,
1554 int remount)
1555{
1556 switch (type) {
1557 case USRQUOTA:
1558 dqopt->flags &= ~DQUOT_USR_ENABLED;
1559 if (remount)
1560 dqopt->flags |= DQUOT_USR_SUSPENDED;
1561 else
1562 dqopt->flags &= ~DQUOT_USR_SUSPENDED;
1563 break;
1564 case GRPQUOTA:
1565 dqopt->flags &= ~DQUOT_GRP_ENABLED;
1566 if (remount)
1567 dqopt->flags |= DQUOT_GRP_SUSPENDED;
1568 else
1569 dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
1570 break;
1571 }
1572}
1573
1574
1575/* 1639/*
1576 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1640 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1577 */ 1641 */
1578int vfs_quota_off(struct super_block *sb, int type, int remount) 1642int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
1579{ 1643{
1580 int cnt, ret = 0; 1644 int cnt, ret = 0;
1581 struct quota_info *dqopt = sb_dqopt(sb); 1645 struct quota_info *dqopt = sb_dqopt(sb);
1582 struct inode *toputinode[MAXQUOTAS]; 1646 struct inode *toputinode[MAXQUOTAS];
1583 1647
1648 /* Cannot turn off usage accounting without turning off limits, or
1649 * suspend quotas and simultaneously turn quotas off. */
1650 if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
1651 || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
1652 DQUOT_USAGE_ENABLED)))
1653 return -EINVAL;
1654
1584 /* We need to serialize quota_off() for device */ 1655 /* We need to serialize quota_off() for device */
1585 mutex_lock(&dqopt->dqonoff_mutex); 1656 mutex_lock(&dqopt->dqonoff_mutex);
1586 1657
@@ -1589,7 +1660,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1589 * sometimes we are called when fill_super() failed and calling 1660 * sometimes we are called when fill_super() failed and calling
1590 * sync_fs() in such cases does no good. 1661 * sync_fs() in such cases does no good.
1591 */ 1662 */
1592 if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) { 1663 if (!sb_any_quota_loaded(sb)) {
1593 mutex_unlock(&dqopt->dqonoff_mutex); 1664 mutex_unlock(&dqopt->dqonoff_mutex);
1594 return 0; 1665 return 0;
1595 } 1666 }
@@ -1597,17 +1668,33 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1597 toputinode[cnt] = NULL; 1668 toputinode[cnt] = NULL;
1598 if (type != -1 && cnt != type) 1669 if (type != -1 && cnt != type)
1599 continue; 1670 continue;
1600 /* If we keep inodes of quota files after remount and quotaoff 1671 if (!sb_has_quota_loaded(sb, cnt))
1601 * is called, drop kept inodes. */
1602 if (!remount && sb_has_quota_suspended(sb, cnt)) {
1603 iput(dqopt->files[cnt]);
1604 dqopt->files[cnt] = NULL;
1605 reset_enable_flags(dqopt, cnt, 0);
1606 continue; 1672 continue;
1673
1674 if (flags & DQUOT_SUSPENDED) {
1675 spin_lock(&dq_state_lock);
1676 dqopt->flags |=
1677 dquot_state_flag(DQUOT_SUSPENDED, cnt);
1678 spin_unlock(&dq_state_lock);
1679 } else {
1680 spin_lock(&dq_state_lock);
1681 dqopt->flags &= ~dquot_state_flag(flags, cnt);
1682 /* Turning off suspended quotas? */
1683 if (!sb_has_quota_loaded(sb, cnt) &&
1684 sb_has_quota_suspended(sb, cnt)) {
1685 dqopt->flags &= ~dquot_state_flag(
1686 DQUOT_SUSPENDED, cnt);
1687 spin_unlock(&dq_state_lock);
1688 iput(dqopt->files[cnt]);
1689 dqopt->files[cnt] = NULL;
1690 continue;
1691 }
1692 spin_unlock(&dq_state_lock);
1607 } 1693 }
1608 if (!sb_has_quota_enabled(sb, cnt)) 1694
1695 /* We still have to keep quota loaded? */
1696 if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
1609 continue; 1697 continue;
1610 reset_enable_flags(dqopt, cnt, remount);
1611 1698
1612 /* Note: these are blocking operations */ 1699 /* Note: these are blocking operations */
1613 drop_dquot_ref(sb, cnt); 1700 drop_dquot_ref(sb, cnt);
@@ -1623,7 +1710,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1623 put_quota_format(dqopt->info[cnt].dqi_format); 1710 put_quota_format(dqopt->info[cnt].dqi_format);
1624 1711
1625 toputinode[cnt] = dqopt->files[cnt]; 1712 toputinode[cnt] = dqopt->files[cnt];
1626 if (!remount) 1713 if (!sb_has_quota_loaded(sb, cnt))
1627 dqopt->files[cnt] = NULL; 1714 dqopt->files[cnt] = NULL;
1628 dqopt->info[cnt].dqi_flags = 0; 1715 dqopt->info[cnt].dqi_flags = 0;
1629 dqopt->info[cnt].dqi_igrace = 0; 1716 dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1718,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1631 dqopt->ops[cnt] = NULL; 1718 dqopt->ops[cnt] = NULL;
1632 } 1719 }
1633 mutex_unlock(&dqopt->dqonoff_mutex); 1720 mutex_unlock(&dqopt->dqonoff_mutex);
1721
1722 /* Skip syncing and setting flags if quota files are hidden */
1723 if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
1724 goto put_inodes;
1725
1634 /* Sync the superblock so that buffers with quota data are written to 1726 /* Sync the superblock so that buffers with quota data are written to
1635 * disk (and so userspace sees correct data afterwards). */ 1727 * disk (and so userspace sees correct data afterwards). */
1636 if (sb->s_op->sync_fs) 1728 if (sb->s_op->sync_fs)
@@ -1646,7 +1738,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1646 mutex_lock(&dqopt->dqonoff_mutex); 1738 mutex_lock(&dqopt->dqonoff_mutex);
1647 /* If quota was reenabled in the meantime, we have 1739 /* If quota was reenabled in the meantime, we have
1648 * nothing to do */ 1740 * nothing to do */
1649 if (!sb_has_quota_enabled(sb, cnt)) { 1741 if (!sb_has_quota_loaded(sb, cnt)) {
1650 mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA); 1742 mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
1651 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | 1743 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
1652 S_NOATIME | S_NOQUOTA); 1744 S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1747,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1655 mark_inode_dirty(toputinode[cnt]); 1747 mark_inode_dirty(toputinode[cnt]);
1656 } 1748 }
1657 mutex_unlock(&dqopt->dqonoff_mutex); 1749 mutex_unlock(&dqopt->dqonoff_mutex);
1750 }
1751 if (sb->s_bdev)
1752 invalidate_bdev(sb->s_bdev);
1753put_inodes:
1754 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1755 if (toputinode[cnt]) {
1658 /* On remount RO, we keep the inode pointer so that we 1756 /* On remount RO, we keep the inode pointer so that we
1659 * can reenable quota on the subsequent remount RW. 1757 * can reenable quota on the subsequent remount RW. We
1660 * But we have better not keep inode pointer when there 1758 * have to check 'flags' variable and not use sb_has_
1661 * is pending delete on the quota file... */ 1759 * function because another quotaon / quotaoff could
1662 if (!remount) 1760 * change global state before we got here. We refuse
1761 * to suspend quotas when there is pending delete on
1762 * the quota file... */
1763 if (!(flags & DQUOT_SUSPENDED))
1663 iput(toputinode[cnt]); 1764 iput(toputinode[cnt]);
1664 else if (!toputinode[cnt]->i_nlink) 1765 else if (!toputinode[cnt]->i_nlink)
1665 ret = -EBUSY; 1766 ret = -EBUSY;
1666 } 1767 }
1667 if (sb->s_bdev)
1668 invalidate_bdev(sb->s_bdev);
1669 return ret; 1768 return ret;
1670} 1769}
1671 1770
1771int vfs_quota_off(struct super_block *sb, int type, int remount)
1772{
1773 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
1774 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
1775}
1776
1672/* 1777/*
1673 * Turn quotas on on a device 1778 * Turn quotas on on a device
1674 */ 1779 */
1675 1780
1676/* Helper function when we already have the inode */ 1781/*
1677static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) 1782 * Helper function to turn quotas on when we already have the inode of
1783 * quota file and no quota information is loaded.
1784 */
1785static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
1786 unsigned int flags)
1678{ 1787{
1679 struct quota_format_type *fmt = find_quota_format(format_id); 1788 struct quota_format_type *fmt = find_quota_format(format_id);
1680 struct super_block *sb = inode->i_sb; 1789 struct super_block *sb = inode->i_sb;
@@ -1696,27 +1805,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1696 error = -EINVAL; 1805 error = -EINVAL;
1697 goto out_fmt; 1806 goto out_fmt;
1698 } 1807 }
1808 /* Usage always has to be set... */
1809 if (!(flags & DQUOT_USAGE_ENABLED)) {
1810 error = -EINVAL;
1811 goto out_fmt;
1812 }
1699 1813
1700 /* As we bypass the pagecache we must now flush the inode so that 1814 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1701 * we see all the changes from userspace... */ 1815 /* As we bypass the pagecache we must now flush the inode so
1702 write_inode_now(inode, 1); 1816 * that we see all the changes from userspace... */
1703 /* And now flush the block cache so that kernel sees the changes */ 1817 write_inode_now(inode, 1);
1704 invalidate_bdev(sb->s_bdev); 1818 /* And now flush the block cache so that kernel sees the
1819 * changes */
1820 invalidate_bdev(sb->s_bdev);
1821 }
1705 mutex_lock(&inode->i_mutex); 1822 mutex_lock(&inode->i_mutex);
1706 mutex_lock(&dqopt->dqonoff_mutex); 1823 mutex_lock(&dqopt->dqonoff_mutex);
1707 if (sb_has_quota_enabled(sb, type) || 1824 if (sb_has_quota_loaded(sb, type)) {
1708 sb_has_quota_suspended(sb, type)) {
1709 error = -EBUSY; 1825 error = -EBUSY;
1710 goto out_lock; 1826 goto out_lock;
1711 } 1827 }
1712 /* We don't want quota and atime on quota files (deadlocks possible) 1828
1713 * Also nobody should write to the file - we use special IO operations 1829 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1714 * which ignore the immutable bit. */ 1830 /* We don't want quota and atime on quota files (deadlocks
1715 down_write(&dqopt->dqptr_sem); 1831 * possible) Also nobody should write to the file - we use
1716 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); 1832 * special IO operations which ignore the immutable bit. */
1717 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 1833 down_write(&dqopt->dqptr_sem);
1718 up_write(&dqopt->dqptr_sem); 1834 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
1719 sb->dq_op->drop(inode); 1835 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
1836 up_write(&dqopt->dqptr_sem);
1837 sb->dq_op->drop(inode);
1838 }
1720 1839
1721 error = -EIO; 1840 error = -EIO;
1722 dqopt->files[type] = igrab(inode); 1841 dqopt->files[type] = igrab(inode);
@@ -1737,7 +1856,9 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1737 } 1856 }
1738 mutex_unlock(&dqopt->dqio_mutex); 1857 mutex_unlock(&dqopt->dqio_mutex);
1739 mutex_unlock(&inode->i_mutex); 1858 mutex_unlock(&inode->i_mutex);
1740 set_enable_flags(dqopt, type); 1859 spin_lock(&dq_state_lock);
1860 dqopt->flags |= dquot_state_flag(flags, type);
1861 spin_unlock(&dq_state_lock);
1741 1862
1742 add_dquot_ref(sb, type); 1863 add_dquot_ref(sb, type);
1743 mutex_unlock(&dqopt->dqonoff_mutex); 1864 mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1891,25 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
1770 struct quota_info *dqopt = sb_dqopt(sb); 1891 struct quota_info *dqopt = sb_dqopt(sb);
1771 struct inode *inode; 1892 struct inode *inode;
1772 int ret; 1893 int ret;
1894 unsigned int flags;
1773 1895
1774 mutex_lock(&dqopt->dqonoff_mutex); 1896 mutex_lock(&dqopt->dqonoff_mutex);
1775 if (!sb_has_quota_suspended(sb, type)) { 1897 if (!sb_has_quota_suspended(sb, type)) {
1776 mutex_unlock(&dqopt->dqonoff_mutex); 1898 mutex_unlock(&dqopt->dqonoff_mutex);
1777 return 0; 1899 return 0;
1778 } 1900 }
1779 BUG_ON(sb_has_quota_enabled(sb, type));
1780
1781 inode = dqopt->files[type]; 1901 inode = dqopt->files[type];
1782 dqopt->files[type] = NULL; 1902 dqopt->files[type] = NULL;
1783 reset_enable_flags(dqopt, type, 0); 1903 spin_lock(&dq_state_lock);
1904 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
1905 DQUOT_LIMITS_ENABLED, type);
1906 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
1907 spin_unlock(&dq_state_lock);
1784 mutex_unlock(&dqopt->dqonoff_mutex); 1908 mutex_unlock(&dqopt->dqonoff_mutex);
1785 1909
1786 ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id); 1910 flags = dquot_generic_flag(flags, type);
1911 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
1912 flags);
1787 iput(inode); 1913 iput(inode);
1788 1914
1789 return ret; 1915 return ret;
@@ -1799,12 +1925,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
1799 if (path->mnt->mnt_sb != sb) 1925 if (path->mnt->mnt_sb != sb)
1800 error = -EXDEV; 1926 error = -EXDEV;
1801 else 1927 else
1802 error = vfs_quota_on_inode(path->dentry->d_inode, type, 1928 error = vfs_load_quota_inode(path->dentry->d_inode, type,
1803 format_id); 1929 format_id, DQUOT_USAGE_ENABLED |
1930 DQUOT_LIMITS_ENABLED);
1804 return error; 1931 return error;
1805} 1932}
1806 1933
1807/* Actual function called from quotactl() */
1808int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 1934int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1809 int remount) 1935 int remount)
1810{ 1936{
@@ -1823,6 +1949,52 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1823} 1949}
1824 1950
1825/* 1951/*
1952 * More powerful function for turning on quotas allowing setting
1953 * of individual quota flags
1954 */
1955int vfs_quota_enable(struct inode *inode, int type, int format_id,
1956 unsigned int flags)
1957{
1958 int ret = 0;
1959 struct super_block *sb = inode->i_sb;
1960 struct quota_info *dqopt = sb_dqopt(sb);
1961
1962 /* Just unsuspend quotas? */
1963 if (flags & DQUOT_SUSPENDED)
1964 return vfs_quota_on_remount(sb, type);
1965 if (!flags)
1966 return 0;
1967 /* Just updating flags needed? */
1968 if (sb_has_quota_loaded(sb, type)) {
1969 mutex_lock(&dqopt->dqonoff_mutex);
1970 /* Now do a reliable test... */
1971 if (!sb_has_quota_loaded(sb, type)) {
1972 mutex_unlock(&dqopt->dqonoff_mutex);
1973 goto load_quota;
1974 }
1975 if (flags & DQUOT_USAGE_ENABLED &&
1976 sb_has_quota_usage_enabled(sb, type)) {
1977 ret = -EBUSY;
1978 goto out_lock;
1979 }
1980 if (flags & DQUOT_LIMITS_ENABLED &&
1981 sb_has_quota_limits_enabled(sb, type)) {
1982 ret = -EBUSY;
1983 goto out_lock;
1984 }
1985 spin_lock(&dq_state_lock);
1986 sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
1987 spin_unlock(&dq_state_lock);
1988out_lock:
1989 mutex_unlock(&dqopt->dqonoff_mutex);
1990 return ret;
1991 }
1992
1993load_quota:
1994 return vfs_load_quota_inode(inode, type, format_id, flags);
1995}
1996
1997/*
1826 * This function is used when filesystem needs to initialize quotas 1998 * This function is used when filesystem needs to initialize quotas
1827 * during mount time. 1999 * during mount time.
1828 */ 2000 */
@@ -1843,7 +2015,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
1843 2015
1844 error = security_quota_on(dentry); 2016 error = security_quota_on(dentry);
1845 if (!error) 2017 if (!error)
1846 error = vfs_quota_on_inode(dentry->d_inode, type, format_id); 2018 error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
2019 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
1847 2020
1848out: 2021out:
1849 dput(dentry); 2022 dput(dentry);
@@ -1866,14 +2039,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
1866 return ret; 2039 return ret;
1867} 2040}
1868 2041
2042static inline qsize_t qbtos(qsize_t blocks)
2043{
2044 return blocks << QIF_DQBLKSIZE_BITS;
2045}
2046
2047static inline qsize_t stoqb(qsize_t space)
2048{
2049 return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
2050}
2051
1869/* Generic routine for getting common part of quota structure */ 2052/* Generic routine for getting common part of quota structure */
1870static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2053static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
1871{ 2054{
1872 struct mem_dqblk *dm = &dquot->dq_dqb; 2055 struct mem_dqblk *dm = &dquot->dq_dqb;
1873 2056
1874 spin_lock(&dq_data_lock); 2057 spin_lock(&dq_data_lock);
1875 di->dqb_bhardlimit = dm->dqb_bhardlimit; 2058 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
1876 di->dqb_bsoftlimit = dm->dqb_bsoftlimit; 2059 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
1877 di->dqb_curspace = dm->dqb_curspace; 2060 di->dqb_curspace = dm->dqb_curspace;
1878 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2061 di->dqb_ihardlimit = dm->dqb_ihardlimit;
1879 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2062 di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1888,14 +2071,12 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
1888{ 2071{
1889 struct dquot *dquot; 2072 struct dquot *dquot;
1890 2073
1891 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2074 dquot = dqget(sb, id, type);
1892 if (!(dquot = dqget(sb, id, type))) { 2075 if (dquot == NODQUOT)
1893 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
1894 return -ESRCH; 2076 return -ESRCH;
1895 }
1896 do_get_dqblk(dquot, di); 2077 do_get_dqblk(dquot, di);
1897 dqput(dquot); 2078 dqput(dquot);
1898 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2079
1899 return 0; 2080 return 0;
1900} 2081}
1901 2082
@@ -1918,28 +2099,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
1918 if (di->dqb_valid & QIF_SPACE) { 2099 if (di->dqb_valid & QIF_SPACE) {
1919 dm->dqb_curspace = di->dqb_curspace; 2100 dm->dqb_curspace = di->dqb_curspace;
1920 check_blim = 1; 2101 check_blim = 1;
2102 __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
1921 } 2103 }
1922 if (di->dqb_valid & QIF_BLIMITS) { 2104 if (di->dqb_valid & QIF_BLIMITS) {
1923 dm->dqb_bsoftlimit = di->dqb_bsoftlimit; 2105 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
1924 dm->dqb_bhardlimit = di->dqb_bhardlimit; 2106 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
1925 check_blim = 1; 2107 check_blim = 1;
2108 __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
1926 } 2109 }
1927 if (di->dqb_valid & QIF_INODES) { 2110 if (di->dqb_valid & QIF_INODES) {
1928 dm->dqb_curinodes = di->dqb_curinodes; 2111 dm->dqb_curinodes = di->dqb_curinodes;
1929 check_ilim = 1; 2112 check_ilim = 1;
2113 __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
1930 } 2114 }
1931 if (di->dqb_valid & QIF_ILIMITS) { 2115 if (di->dqb_valid & QIF_ILIMITS) {
1932 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2116 dm->dqb_isoftlimit = di->dqb_isoftlimit;
1933 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2117 dm->dqb_ihardlimit = di->dqb_ihardlimit;
1934 check_ilim = 1; 2118 check_ilim = 1;
2119 __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
1935 } 2120 }
1936 if (di->dqb_valid & QIF_BTIME) 2121 if (di->dqb_valid & QIF_BTIME) {
1937 dm->dqb_btime = di->dqb_btime; 2122 dm->dqb_btime = di->dqb_btime;
1938 if (di->dqb_valid & QIF_ITIME) 2123 check_blim = 1;
2124 __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2125 }
2126 if (di->dqb_valid & QIF_ITIME) {
1939 dm->dqb_itime = di->dqb_itime; 2127 dm->dqb_itime = di->dqb_itime;
2128 check_ilim = 1;
2129 __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2130 }
1940 2131
1941 if (check_blim) { 2132 if (check_blim) {
1942 if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) { 2133 if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
1943 dm->dqb_btime = 0; 2134 dm->dqb_btime = 0;
1944 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2135 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
1945 } 2136 }
@@ -1969,14 +2160,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
1969 struct dquot *dquot; 2160 struct dquot *dquot;
1970 int rc; 2161 int rc;
1971 2162
1972 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2163 dquot = dqget(sb, id, type);
1973 if (!(dquot = dqget(sb, id, type))) { 2164 if (!dquot) {
1974 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2165 rc = -ESRCH;
1975 return -ESRCH; 2166 goto out;
1976 } 2167 }
1977 rc = do_set_dqblk(dquot, di); 2168 rc = do_set_dqblk(dquot, di);
1978 dqput(dquot); 2169 dqput(dquot);
1979 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2170out:
1980 return rc; 2171 return rc;
1981} 2172}
1982 2173
@@ -1986,7 +2177,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
1986 struct mem_dqinfo *mi; 2177 struct mem_dqinfo *mi;
1987 2178
1988 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2179 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
1989 if (!sb_has_quota_enabled(sb, type)) { 2180 if (!sb_has_quota_active(sb, type)) {
1990 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2181 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
1991 return -ESRCH; 2182 return -ESRCH;
1992 } 2183 }
@@ -2005,11 +2196,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2005int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2196int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2006{ 2197{
2007 struct mem_dqinfo *mi; 2198 struct mem_dqinfo *mi;
2199 int err = 0;
2008 2200
2009 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2201 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
2010 if (!sb_has_quota_enabled(sb, type)) { 2202 if (!sb_has_quota_active(sb, type)) {
2011 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2203 err = -ESRCH;
2012 return -ESRCH; 2204 goto out;
2013 } 2205 }
2014 mi = sb_dqopt(sb)->info + type; 2206 mi = sb_dqopt(sb)->info + type;
2015 spin_lock(&dq_data_lock); 2207 spin_lock(&dq_data_lock);
@@ -2023,8 +2215,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2023 mark_info_dirty(sb, type); 2215 mark_info_dirty(sb, type);
2024 /* Force write to disk */ 2216 /* Force write to disk */
2025 sb->dq_op->write_info(sb, type); 2217 sb->dq_op->write_info(sb, type);
2218out:
2026 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2219 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2027 return 0; 2220 return err;
2028} 2221}
2029 2222
2030struct quotactl_ops vfs_quotactl_ops = { 2223struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2379,13 @@ EXPORT_SYMBOL(register_quota_format);
2186EXPORT_SYMBOL(unregister_quota_format); 2379EXPORT_SYMBOL(unregister_quota_format);
2187EXPORT_SYMBOL(dqstats); 2380EXPORT_SYMBOL(dqstats);
2188EXPORT_SYMBOL(dq_data_lock); 2381EXPORT_SYMBOL(dq_data_lock);
2382EXPORT_SYMBOL(vfs_quota_enable);
2189EXPORT_SYMBOL(vfs_quota_on); 2383EXPORT_SYMBOL(vfs_quota_on);
2190EXPORT_SYMBOL(vfs_quota_on_path); 2384EXPORT_SYMBOL(vfs_quota_on_path);
2191EXPORT_SYMBOL(vfs_quota_on_mount); 2385EXPORT_SYMBOL(vfs_quota_on_mount);
2386EXPORT_SYMBOL(vfs_quota_disable);
2192EXPORT_SYMBOL(vfs_quota_off); 2387EXPORT_SYMBOL(vfs_quota_off);
2388EXPORT_SYMBOL(dquot_scan_active);
2193EXPORT_SYMBOL(vfs_quota_sync); 2389EXPORT_SYMBOL(vfs_quota_sync);
2194EXPORT_SYMBOL(vfs_get_dqinfo); 2390EXPORT_SYMBOL(vfs_get_dqinfo);
2195EXPORT_SYMBOL(vfs_set_dqinfo); 2391EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2203,6 +2399,8 @@ EXPORT_SYMBOL(dquot_mark_dquot_dirty);
2203EXPORT_SYMBOL(dquot_initialize); 2399EXPORT_SYMBOL(dquot_initialize);
2204EXPORT_SYMBOL(dquot_drop); 2400EXPORT_SYMBOL(dquot_drop);
2205EXPORT_SYMBOL(vfs_dq_drop); 2401EXPORT_SYMBOL(vfs_dq_drop);
2402EXPORT_SYMBOL(dqget);
2403EXPORT_SYMBOL(dqput);
2206EXPORT_SYMBOL(dquot_alloc_space); 2404EXPORT_SYMBOL(dquot_alloc_space);
2207EXPORT_SYMBOL(dquot_alloc_inode); 2405EXPORT_SYMBOL(dquot_alloc_inode);
2208EXPORT_SYMBOL(dquot_free_space); 2406EXPORT_SYMBOL(dquot_free_space);
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
new file mode 100644
index 000000000000..0c754e64232b
--- /dev/null
+++ b/fs/ecryptfs/Kconfig
@@ -0,0 +1,11 @@
1config ECRYPT_FS
2 tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
3 depends on EXPERIMENTAL && KEYS && CRYPTO && NET
4 help
5 Encrypted filesystem that operates on the VFS layer. See
6 <file:Documentation/filesystems/ecryptfs.txt> to learn more about
7 eCryptfs. Userspace components are required and can be
8 obtained from <http://ecryptfs.sf.net>.
9
10 To compile this file system support as a module, choose M here: the
11 module will be called ecryptfs.
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 6046239465a1..c01e043670e2 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -175,8 +175,8 @@ out:
175 * 175 *
176 * Returns zero on success; non-zero on error. 176 * Returns zero on success; non-zero on error.
177 */ 177 */
178static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, 178int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
179 loff_t offset) 179 loff_t offset)
180{ 180{
181 int rc = 0; 181 int rc = 0;
182 char dst[MD5_DIGEST_SIZE]; 182 char dst[MD5_DIGEST_SIZE];
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags(
924 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; 924 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
925 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) 925 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
926 crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; 926 crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED;
927 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
928 crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES;
929 if (mount_crypt_stat->flags
930 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)
931 crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK;
932 else if (mount_crypt_stat->flags
933 & ECRYPTFS_GLOBAL_ENCFN_USE_FEK)
934 crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK;
935 }
927} 936}
928 937
929static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( 938static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem {
1060static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { 1069static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
1061 {0x00000001, ECRYPTFS_ENABLE_HMAC}, 1070 {0x00000001, ECRYPTFS_ENABLE_HMAC},
1062 {0x00000002, ECRYPTFS_ENCRYPTED}, 1071 {0x00000002, ECRYPTFS_ENCRYPTED},
1063 {0x00000004, ECRYPTFS_METADATA_IN_XATTR} 1072 {0x00000004, ECRYPTFS_METADATA_IN_XATTR},
1073 {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES}
1064}; 1074};
1065 1075
1066/** 1076/**
@@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = {
1149 1159
1150/** 1160/**
1151 * ecryptfs_code_for_cipher_string 1161 * ecryptfs_code_for_cipher_string
1152 * @crypt_stat: The cryptographic context 1162 * @cipher_name: The string alias for the cipher
1163 * @key_bytes: Length of key in bytes; used for AES code selection
1153 * 1164 *
1154 * Returns zero on no match, or the cipher code on match 1165 * Returns zero on no match, or the cipher code on match
1155 */ 1166 */
1156u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) 1167u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes)
1157{ 1168{
1158 int i; 1169 int i;
1159 u8 code = 0; 1170 u8 code = 0;
1160 struct ecryptfs_cipher_code_str_map_elem *map = 1171 struct ecryptfs_cipher_code_str_map_elem *map =
1161 ecryptfs_cipher_code_str_map; 1172 ecryptfs_cipher_code_str_map;
1162 1173
1163 if (strcmp(crypt_stat->cipher, "aes") == 0) { 1174 if (strcmp(cipher_name, "aes") == 0) {
1164 switch (crypt_stat->key_size) { 1175 switch (key_bytes) {
1165 case 16: 1176 case 16:
1166 code = RFC2440_CIPHER_AES_128; 1177 code = RFC2440_CIPHER_AES_128;
1167 break; 1178 break;
@@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
1173 } 1184 }
1174 } else { 1185 } else {
1175 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) 1186 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
1176 if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ 1187 if (strcmp(cipher_name, map[i].cipher_str) == 0) {
1177 code = map[i].cipher_code; 1188 code = map[i].cipher_code;
1178 break; 1189 break;
1179 } 1190 }
@@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data,
1212 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 1223 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
1213 int rc; 1224 int rc;
1214 1225
1226 if (crypt_stat->extent_size == 0)
1227 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
1215 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, 1228 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
1216 ecryptfs_inode); 1229 ecryptfs_inode);
1217 if (rc) { 1230 if (rc) {
@@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data,
1221 } 1234 }
1222 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { 1235 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
1223 rc = -EINVAL; 1236 rc = -EINVAL;
1224 ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n");
1225 } 1237 }
1226out: 1238out:
1227 return rc; 1239 return rc;
@@ -1628,95 +1640,95 @@ out:
1628} 1640}
1629 1641
1630/** 1642/**
1631 * ecryptfs_encode_filename - converts a plaintext file name to cipher text 1643 * ecryptfs_encrypt_filename - encrypt filename
1632 * @crypt_stat: The crypt_stat struct associated with the file anem to encode
1633 * @name: The plaintext name
1634 * @length: The length of the plaintext
1635 * @encoded_name: The encypted name
1636 * 1644 *
1637 * Encrypts and encodes a filename into something that constitutes a 1645 * CBC-encrypts the filename. We do not want to encrypt the same
1638 * valid filename for a filesystem, with printable characters. 1646 * filename with the same key and IV, which may happen with hard
1647 * links, so we prepend random bits to each filename.
1639 * 1648 *
1640 * We assume that we have a properly initialized crypto context, 1649 * Returns zero on success; non-zero otherwise
1641 * pointed to by crypt_stat->tfm.
1642 *
1643 * TODO: Implement filename decoding and decryption here, in place of
1644 * memcpy. We are keeping the framework around for now to (1)
1645 * facilitate testing of the components needed to implement filename
1646 * encryption and (2) to provide a code base from which other
1647 * developers in the community can easily implement this feature.
1648 *
1649 * Returns the length of encoded filename; negative if error
1650 */ 1650 */
1651int 1651static int
1652ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, 1652ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
1653 const char *name, int length, char **encoded_name) 1653 struct ecryptfs_crypt_stat *crypt_stat,
1654 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
1654{ 1655{
1655 int error = 0; 1656 int rc = 0;
1656 1657
1657 (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); 1658 filename->encrypted_filename = NULL;
1658 if (!(*encoded_name)) { 1659 filename->encrypted_filename_size = 0;
1659 error = -ENOMEM; 1660 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
1661 || (mount_crypt_stat && (mount_crypt_stat->flags
1662 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
1663 size_t packet_size;
1664 size_t remaining_bytes;
1665
1666 rc = ecryptfs_write_tag_70_packet(
1667 NULL, NULL,
1668 &filename->encrypted_filename_size,
1669 mount_crypt_stat, NULL,
1670 filename->filename_size);
1671 if (rc) {
1672 printk(KERN_ERR "%s: Error attempting to get packet "
1673 "size for tag 72; rc = [%d]\n", __func__,
1674 rc);
1675 filename->encrypted_filename_size = 0;
1676 goto out;
1677 }
1678 filename->encrypted_filename =
1679 kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
1680 if (!filename->encrypted_filename) {
1681 printk(KERN_ERR "%s: Out of memory whilst attempting "
1682 "to kmalloc [%zd] bytes\n", __func__,
1683 filename->encrypted_filename_size);
1684 rc = -ENOMEM;
1685 goto out;
1686 }
1687 remaining_bytes = filename->encrypted_filename_size;
1688 rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename,
1689 &remaining_bytes,
1690 &packet_size,
1691 mount_crypt_stat,
1692 filename->filename,
1693 filename->filename_size);
1694 if (rc) {
1695 printk(KERN_ERR "%s: Error attempting to generate "
1696 "tag 70 packet; rc = [%d]\n", __func__,
1697 rc);
1698 kfree(filename->encrypted_filename);
1699 filename->encrypted_filename = NULL;
1700 filename->encrypted_filename_size = 0;
1701 goto out;
1702 }
1703 filename->encrypted_filename_size = packet_size;
1704 } else {
1705 printk(KERN_ERR "%s: No support for requested filename "
1706 "encryption method in this release\n", __func__);
1707 rc = -ENOTSUPP;
1660 goto out; 1708 goto out;
1661 } 1709 }
1662 /* TODO: Filename encryption is a scheduled feature for a
1663 * future version of eCryptfs. This function is here only for
1664 * the purpose of providing a framework for other developers
1665 * to easily implement filename encryption. Hint: Replace this
1666 * memcpy() with a call to encrypt and encode the
1667 * filename, the set the length accordingly. */
1668 memcpy((void *)(*encoded_name), (void *)name, length);
1669 (*encoded_name)[length] = '\0';
1670 error = length + 1;
1671out: 1710out:
1672 return error; 1711 return rc;
1673} 1712}
1674 1713
1675/** 1714static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1676 * ecryptfs_decode_filename - converts the cipher text name to plaintext 1715 const char *name, size_t name_size)
1677 * @crypt_stat: The crypt_stat struct associated with the file
1678 * @name: The filename in cipher text
1679 * @length: The length of the cipher text name
1680 * @decrypted_name: The plaintext name
1681 *
1682 * Decodes and decrypts the filename.
1683 *
1684 * We assume that we have a properly initialized crypto context,
1685 * pointed to by crypt_stat->tfm.
1686 *
1687 * TODO: Implement filename decoding and decryption here, in place of
1688 * memcpy. We are keeping the framework around for now to (1)
1689 * facilitate testing of the components needed to implement filename
1690 * encryption and (2) to provide a code base from which other
1691 * developers in the community can easily implement this feature.
1692 *
1693 * Returns the length of decoded filename; negative if error
1694 */
1695int
1696ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
1697 const char *name, int length, char **decrypted_name)
1698{ 1716{
1699 int error = 0; 1717 int rc = 0;
1700 1718
1701 (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); 1719 (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL);
1702 if (!(*decrypted_name)) { 1720 if (!(*copied_name)) {
1703 error = -ENOMEM; 1721 rc = -ENOMEM;
1704 goto out; 1722 goto out;
1705 } 1723 }
1706 /* TODO: Filename encryption is a scheduled feature for a 1724 memcpy((void *)(*copied_name), (void *)name, name_size);
1707 * future version of eCryptfs. This function is here only for 1725 (*copied_name)[(name_size)] = '\0'; /* Only for convenience
1708 * the purpose of providing a framework for other developers
1709 * to easily implement filename encryption. Hint: Replace this
1710 * memcpy() with a call to decode and decrypt the
1711 * filename, the set the length accordingly. */
1712 memcpy((void *)(*decrypted_name), (void *)name, length);
1713 (*decrypted_name)[length + 1] = '\0'; /* Only for convenience
1714 * in printing out the 1726 * in printing out the
1715 * string in debug 1727 * string in debug
1716 * messages */ 1728 * messages */
1717 error = length; 1729 (*copied_name_size) = (name_size + 1);
1718out: 1730out:
1719 return error; 1731 return rc;
1720} 1732}
1721 1733
1722/** 1734/**
@@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1740 *key_tfm = NULL; 1752 *key_tfm = NULL;
1741 if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { 1753 if (*key_size > ECRYPTFS_MAX_KEY_BYTES) {
1742 rc = -EINVAL; 1754 rc = -EINVAL;
1743 printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " 1755 printk(KERN_ERR "Requested key size is [%zd] bytes; maximum "
1744 "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); 1756 "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES);
1745 goto out; 1757 goto out;
1746 } 1758 }
@@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1765 get_random_bytes(dummy_key, *key_size); 1777 get_random_bytes(dummy_key, *key_size);
1766 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); 1778 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
1767 if (rc) { 1779 if (rc) {
1768 printk(KERN_ERR "Error attempting to set key of size [%Zd] for " 1780 printk(KERN_ERR "Error attempting to set key of size [%zd] for "
1769 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); 1781 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
1770 rc = -EINVAL; 1782 rc = -EINVAL;
1771 goto out; 1783 goto out;
@@ -1910,3 +1922,341 @@ out:
1910 mutex_unlock(&key_tfm_list_mutex); 1922 mutex_unlock(&key_tfm_list_mutex);
1911 return rc; 1923 return rc;
1912} 1924}
1925
/* 64 characters forming a 6-bit target field */
static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
						 "EFGHIJKLMNOPQRST"
						 "UVWXYZabcdefghij"
						 "klmnopqrstuvwxyz");

/* Reverse of portable_filename_chars: maps an encoded character back
 * to its 6-bit value. We could either offset on every reverse map or
 * just pad some 0x00's at the front here.
 *
 * The table is declared with 256 entries so that *any* byte found in
 * a lower filename is a valid index; entries without an explicit
 * initializer (everything above 'z') are zero. */
static const unsigned char filename_rev_map[256] = {
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */
	0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */
	0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
	0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
	0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
	0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
	0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
	0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
	0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
	0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
	0x3D, 0x3E, 0x3F				/* 122 */
};

/**
 * ecryptfs_encode_for_filename
 * @dst: Destination location for encoded filename; if NULL, only
 *       @dst_size is computed and nothing is written
 * @dst_size: Set to the size of the encoded filename in bytes (always
 *            a multiple of 4; 0 when @src_size is 0)
 * @src: Source location for the filename to encode
 * @src_size: Size of the source in bytes
 *
 * Encodes each block of 3 source octets into 4 characters taken from
 * portable_filename_chars. A trailing partial block is zero-padded
 * before being encoded. The output is not NUL-terminated.
 */
void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
				  unsigned char *src, size_t src_size)
{
	size_t num_blocks;
	size_t block_num = 0;
	size_t dst_offset = 0;
	unsigned char last_block[3];

	if (src_size == 0) {
		(*dst_size) = 0;
		goto out;
	}
	num_blocks = (src_size / 3);
	if ((src_size % 3) == 0) {
		memcpy(last_block, (&src[src_size - 3]), 3);
	} else {
		/* Build a zero-padded copy of the trailing 1 or 2 octets */
		num_blocks++;
		last_block[2] = 0x00;
		switch (src_size % 3) {
		case 1:
			last_block[0] = src[src_size - 1];
			last_block[1] = 0x00;
			break;
		case 2:
			last_block[0] = src[src_size - 2];
			last_block[1] = src[src_size - 1];
			break;
		}
	}
	(*dst_size) = (num_blocks * 4);
	if (!dst)
		goto out;
	while (block_num < num_blocks) {
		unsigned char *src_block;
		unsigned char dst_block[4];

		/* The final block always comes from the padded copy so
		 * that we never read past the end of @src */
		if (block_num == (num_blocks - 1))
			src_block = last_block;
		else
			src_block = &src[block_num * 3];
		/* Split 3 x 8 bits into 4 x 6 bits */
		dst_block[0] = ((src_block[0] >> 2) & 0x3F);
		dst_block[1] = (((src_block[0] << 4) & 0x30)
				| ((src_block[1] >> 4) & 0x0F));
		dst_block[2] = (((src_block[1] << 2) & 0x3C)
				| ((src_block[2] >> 6) & 0x03));
		dst_block[3] = (src_block[2] & 0x3F);
		dst[dst_offset++] = portable_filename_chars[dst_block[0]];
		dst[dst_offset++] = portable_filename_chars[dst_block[1]];
		dst[dst_offset++] = portable_filename_chars[dst_block[2]];
		dst[dst_offset++] = portable_filename_chars[dst_block[3]];
		block_num++;
	}
out:
	return;
}

/**
 * ecryptfs_decode_from_filename
 * @dst: If NULL, this function only sets @dst_size and returns. If
 *       non-NULL, this function decodes the encoded octets in @src
 *       into the memory that @dst points to.
 * @dst_size: Set to the size of the decoded string. When @dst is NULL
 *            this is an upper bound on the space a subsequent
 *            decoding call will touch.
 * @src: The encoded set of octets to decode.
 * @src_size: The size of the encoded set of octets to decode.
 *
 * Characters that are not part of portable_filename_chars decode as
 * the 6-bit value 0.
 */
static void
ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
			      const unsigned char *src, size_t src_size)
{
	unsigned int current_bit_offset = 0;
	size_t src_byte_offset = 0;
	size_t dst_byte_offset = 0;

	if (dst == NULL) {
		/* Not exact; conservatively long. Every block of 4
		 * encoded characters decodes into a block of 3
		 * decoded characters. This segment of code provides
		 * the caller with the maximum amount of allocated
		 * space that @dst will need to point to in a
		 * subsequent call. */
		(*dst_size) = (((src_size + 1) * 3) / 4);
		goto out;
	}
	while (src_byte_offset < src_size) {
		/* filename_rev_map covers all 256 byte values, so any
		 * input byte is a safe index */
		unsigned char src_byte =
				filename_rev_map[src[src_byte_offset]];

		switch (current_bit_offset) {
		case 0:
			dst[dst_byte_offset] = (src_byte << 2);
			current_bit_offset = 6;
			break;
		case 6:
			dst[dst_byte_offset++] |= (src_byte >> 4);
			dst[dst_byte_offset] = ((src_byte & 0xF)
						 << 4);
			current_bit_offset = 4;
			break;
		case 4:
			dst[dst_byte_offset++] |= (src_byte >> 2);
			dst[dst_byte_offset] = (src_byte << 6);
			current_bit_offset = 2;
			break;
		case 2:
			/* This completes a 3-octet block. Do not touch
			 * the next octet here: when @src_size is a
			 * multiple of 4 it lies one past the size
			 * reported for a NULL @dst, and case 0 assigns
			 * it anyway if more input follows. */
			dst[dst_byte_offset++] |= (src_byte);
			current_bit_offset = 0;
			break;
		}
		src_byte_offset++;
	}
	(*dst_size) = dst_byte_offset;
out:
	return;
}
2074
2075/**
2076 * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
2077 * @crypt_stat: The crypt_stat struct associated with the file anem to encode
2078 * @name: The plaintext name
2079 * @length: The length of the plaintext
2080 * @encoded_name: The encypted name
2081 *
2082 * Encrypts and encodes a filename into something that constitutes a
2083 * valid filename for a filesystem, with printable characters.
2084 *
2085 * We assume that we have a properly initialized crypto context,
2086 * pointed to by crypt_stat->tfm.
2087 *
2088 * Returns zero on success; non-zero on otherwise
2089 */
2090int ecryptfs_encrypt_and_encode_filename(
2091 char **encoded_name,
2092 size_t *encoded_name_size,
2093 struct ecryptfs_crypt_stat *crypt_stat,
2094 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2095 const char *name, size_t name_size)
2096{
2097 size_t encoded_name_no_prefix_size;
2098 int rc = 0;
2099
2100 (*encoded_name) = NULL;
2101 (*encoded_name_size) = 0;
2102 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
2103 || (mount_crypt_stat && (mount_crypt_stat->flags
2104 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
2105 struct ecryptfs_filename *filename;
2106
2107 filename = kzalloc(sizeof(*filename), GFP_KERNEL);
2108 if (!filename) {
2109 printk(KERN_ERR "%s: Out of memory whilst attempting "
2110 "to kzalloc [%zd] bytes\n", __func__,
2111 sizeof(*filename));
2112 rc = -ENOMEM;
2113 goto out;
2114 }
2115 filename->filename = (char *)name;
2116 filename->filename_size = name_size;
2117 rc = ecryptfs_encrypt_filename(filename, crypt_stat,
2118 mount_crypt_stat);
2119 if (rc) {
2120 printk(KERN_ERR "%s: Error attempting to encrypt "
2121 "filename; rc = [%d]\n", __func__, rc);
2122 kfree(filename);
2123 goto out;
2124 }
2125 ecryptfs_encode_for_filename(
2126 NULL, &encoded_name_no_prefix_size,
2127 filename->encrypted_filename,
2128 filename->encrypted_filename_size);
2129 if ((crypt_stat && (crypt_stat->flags
2130 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2131 || (mount_crypt_stat
2132 && (mount_crypt_stat->flags
2133 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
2134 (*encoded_name_size) =
2135 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2136 + encoded_name_no_prefix_size);
2137 else
2138 (*encoded_name_size) =
2139 (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2140 + encoded_name_no_prefix_size);
2141 (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
2142 if (!(*encoded_name)) {
2143 printk(KERN_ERR "%s: Out of memory whilst attempting "
2144 "to kzalloc [%zd] bytes\n", __func__,
2145 (*encoded_name_size));
2146 rc = -ENOMEM;
2147 kfree(filename->encrypted_filename);
2148 kfree(filename);
2149 goto out;
2150 }
2151 if ((crypt_stat && (crypt_stat->flags
2152 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2153 || (mount_crypt_stat
2154 && (mount_crypt_stat->flags
2155 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
2156 memcpy((*encoded_name),
2157 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2158 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
2159 ecryptfs_encode_for_filename(
2160 ((*encoded_name)
2161 + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
2162 &encoded_name_no_prefix_size,
2163 filename->encrypted_filename,
2164 filename->encrypted_filename_size);
2165 (*encoded_name_size) =
2166 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2167 + encoded_name_no_prefix_size);
2168 (*encoded_name)[(*encoded_name_size)] = '\0';
2169 (*encoded_name_size)++;
2170 } else {
2171 rc = -ENOTSUPP;
2172 }
2173 if (rc) {
2174 printk(KERN_ERR "%s: Error attempting to encode "
2175 "encrypted filename; rc = [%d]\n", __func__,
2176 rc);
2177 kfree((*encoded_name));
2178 (*encoded_name) = NULL;
2179 (*encoded_name_size) = 0;
2180 }
2181 kfree(filename->encrypted_filename);
2182 kfree(filename);
2183 } else {
2184 rc = ecryptfs_copy_filename(encoded_name,
2185 encoded_name_size,
2186 name, name_size);
2187 }
2188out:
2189 return rc;
2190}
2191
2192/**
2193 * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
2194 * @plaintext_name: The plaintext name
2195 * @plaintext_name_size: The plaintext name size
2196 * @ecryptfs_dir_dentry: eCryptfs directory dentry
2197 * @name: The filename in cipher text
2198 * @name_size: The cipher text name size
2199 *
2200 * Decrypts and decodes the filename.
2201 *
2202 * Returns zero on error; non-zero otherwise
2203 */
2204int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2205 size_t *plaintext_name_size,
2206 struct dentry *ecryptfs_dir_dentry,
2207 const char *name, size_t name_size)
2208{
2209 char *decoded_name;
2210 size_t decoded_name_size;
2211 size_t packet_size;
2212 int rc = 0;
2213
2214 if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
2215 && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2216 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
2217 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2218 &ecryptfs_superblock_to_private(
2219 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2220 const char *orig_name = name;
2221 size_t orig_name_size = name_size;
2222
2223 name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
2224 name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
2225 ecryptfs_decode_from_filename(NULL, &decoded_name_size,
2226 name, name_size);
2227 decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
2228 if (!decoded_name) {
2229 printk(KERN_ERR "%s: Out of memory whilst attempting "
2230 "to kmalloc [%zd] bytes\n", __func__,
2231 decoded_name_size);
2232 rc = -ENOMEM;
2233 goto out;
2234 }
2235 ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
2236 name, name_size);
2237 rc = ecryptfs_parse_tag_70_packet(plaintext_name,
2238 plaintext_name_size,
2239 &packet_size,
2240 mount_crypt_stat,
2241 decoded_name,
2242 decoded_name_size);
2243 if (rc) {
2244 printk(KERN_INFO "%s: Could not parse tag 70 packet "
2245 "from filename; copying through filename "
2246 "as-is\n", __func__);
2247 rc = ecryptfs_copy_filename(plaintext_name,
2248 plaintext_name_size,
2249 orig_name, orig_name_size);
2250 goto out_free;
2251 }
2252 } else {
2253 rc = ecryptfs_copy_filename(plaintext_name,
2254 plaintext_name_size,
2255 name, name_size);
2256 goto out;
2257 }
2258out_free:
2259 kfree(decoded_name);
2260out:
2261 return rc;
2262}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a75026d35d16..c11fc95714ab 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -51,12 +51,16 @@
51#define ECRYPTFS_VERSIONING_XATTR 0x00000010 51#define ECRYPTFS_VERSIONING_XATTR 0x00000010
52#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 52#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020
53#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 53#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040
54#define ECRYPTFS_VERSIONING_HMAC 0x00000080
55#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100
56#define ECRYPTFS_VERSIONING_GCM 0x00000200
54#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ 57#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
55 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ 58 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
56 | ECRYPTFS_VERSIONING_PUBKEY \ 59 | ECRYPTFS_VERSIONING_PUBKEY \
57 | ECRYPTFS_VERSIONING_XATTR \ 60 | ECRYPTFS_VERSIONING_XATTR \
58 | ECRYPTFS_VERSIONING_MULTKEY \ 61 | ECRYPTFS_VERSIONING_MULTKEY \
59 | ECRYPTFS_VERSIONING_DEVMISC) 62 | ECRYPTFS_VERSIONING_DEVMISC \
63 | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
60#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 64#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
61#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH 65#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
62#define ECRYPTFS_SALT_SIZE 8 66#define ECRYPTFS_SALT_SIZE 8
@@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key)
199#define ECRYPTFS_DEFAULT_CIPHER "aes" 203#define ECRYPTFS_DEFAULT_CIPHER "aes"
200#define ECRYPTFS_DEFAULT_KEY_BYTES 16 204#define ECRYPTFS_DEFAULT_KEY_BYTES 16
201#define ECRYPTFS_DEFAULT_HASH "md5" 205#define ECRYPTFS_DEFAULT_HASH "md5"
206#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
202#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 207#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
203#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C 208#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
204#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED 209#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key)
206#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 211#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41
207#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 212#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42
208#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 213#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43
214#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename
215 * as dentry name */
216#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in
217 * metadata */
218#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as
219 * dentry name */
220#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
221 * metadata */
222/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
223 * ECRYPTFS_MAX_IV_BYTES */
224#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
225#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
209#define MD5_DIGEST_SIZE 16 226#define MD5_DIGEST_SIZE 16
227#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
228#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
229#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
230#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
231#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
232#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
210 233
211struct ecryptfs_key_sig { 234struct ecryptfs_key_sig {
212 struct list_head crypt_stat_list; 235 struct list_head crypt_stat_list;
213 char keysig[ECRYPTFS_SIG_SIZE_HEX]; 236 char keysig[ECRYPTFS_SIG_SIZE_HEX];
214}; 237};
215 238
239struct ecryptfs_filename {
240 struct list_head crypt_stat_list;
241#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001
242 u32 flags;
243 u32 seq_no;
244 char *filename;
245 char *encrypted_filename;
246 size_t filename_size;
247 size_t encrypted_filename_size;
248 char fnek_sig[ECRYPTFS_SIG_SIZE_HEX];
249 char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1];
250};
251
216/** 252/**
217 * This is the primary struct associated with each encrypted file. 253 * This is the primary struct associated with each encrypted file.
218 * 254 *
219 * TODO: cache align/pack? 255 * TODO: cache align/pack?
220 */ 256 */
221struct ecryptfs_crypt_stat { 257struct ecryptfs_crypt_stat {
222#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 258#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
223#define ECRYPTFS_POLICY_APPLIED 0x00000002 259#define ECRYPTFS_POLICY_APPLIED 0x00000002
224#define ECRYPTFS_NEW_FILE 0x00000004 260#define ECRYPTFS_NEW_FILE 0x00000004
225#define ECRYPTFS_ENCRYPTED 0x00000008 261#define ECRYPTFS_ENCRYPTED 0x00000008
226#define ECRYPTFS_SECURITY_WARNING 0x00000010 262#define ECRYPTFS_SECURITY_WARNING 0x00000010
227#define ECRYPTFS_ENABLE_HMAC 0x00000020 263#define ECRYPTFS_ENABLE_HMAC 0x00000020
228#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 264#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040
229#define ECRYPTFS_KEY_VALID 0x00000080 265#define ECRYPTFS_KEY_VALID 0x00000080
230#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 266#define ECRYPTFS_METADATA_IN_XATTR 0x00000100
231#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 267#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200
232#define ECRYPTFS_KEY_SET 0x00000400 268#define ECRYPTFS_KEY_SET 0x00000400
269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800
270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000
233 u32 flags; 272 u32 flags;
234 unsigned int file_version; 273 unsigned int file_version;
235 size_t iv_bytes; 274 size_t iv_bytes;
@@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat {
332#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 371#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002
333#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 372#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004
334#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 373#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008
374#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010
375#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020
376#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040
335 u32 flags; 377 u32 flags;
336 struct list_head global_auth_tok_list; 378 struct list_head global_auth_tok_list;
337 struct mutex global_auth_tok_list_mutex; 379 struct mutex global_auth_tok_list_mutex;
338 size_t num_global_auth_toks; 380 size_t num_global_auth_toks;
339 size_t global_default_cipher_key_size; 381 size_t global_default_cipher_key_size;
382 size_t global_default_fn_cipher_key_bytes;
340 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE 383 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
341 + 1]; 384 + 1];
385 unsigned char global_default_fn_cipher_name[
386 ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
387 char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
342}; 388};
343 389
344/* superblock private data. */ 390/* superblock private data. */
@@ -571,13 +617,22 @@ struct ecryptfs_open_req {
571int ecryptfs_interpose(struct dentry *hidden_dentry, 617int ecryptfs_interpose(struct dentry *hidden_dentry,
572 struct dentry *this_dentry, struct super_block *sb, 618 struct dentry *this_dentry, struct super_block *sb,
573 u32 flags); 619 u32 flags);
620int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
621 struct dentry *lower_dentry,
622 struct ecryptfs_crypt_stat *crypt_stat,
623 struct inode *ecryptfs_dir_inode,
624 struct nameidata *ecryptfs_nd);
625int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
626 size_t *decrypted_name_size,
627 struct dentry *ecryptfs_dentry,
628 const char *name, size_t name_size);
574int ecryptfs_fill_zeros(struct file *file, loff_t new_length); 629int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
575int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, 630int ecryptfs_encrypt_and_encode_filename(
576 const char *name, int length, 631 char **encoded_name,
577 char **decrypted_name); 632 size_t *encoded_name_size,
578int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, 633 struct ecryptfs_crypt_stat *crypt_stat,
579 const char *name, int length, 634 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
580 char **encoded_name); 635 const char *name, size_t name_size);
581struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); 636struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
582void ecryptfs_dump_hex(char *data, int bytes); 637void ecryptfs_dump_hex(char *data, int bytes);
583int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, 638int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
@@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data,
599 struct inode *ecryptfs_inode); 654 struct inode *ecryptfs_inode);
600int ecryptfs_read_and_validate_xattr_region(char *page_virt, 655int ecryptfs_read_and_validate_xattr_region(char *page_virt,
601 struct dentry *ecryptfs_dentry); 656 struct dentry *ecryptfs_dentry);
602u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); 657u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
603int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); 658int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
604void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); 659void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
605int ecryptfs_generate_key_packet_set(char *dest_base, 660int ecryptfs_generate_key_packet_set(char *dest_base,
@@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file,
694 struct vfsmount *lower_mnt, 749 struct vfsmount *lower_mnt,
695 const struct cred *cred); 750 const struct cred *cred);
696int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); 751int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
752int
753ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
754 size_t *packet_size,
755 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
756 char *filename, size_t filename_size);
757int
758ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
759 size_t *packet_size,
760 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
761 char *data, size_t max_packet_size);
762int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
763 loff_t offset);
697 764
698#endif /* #ifndef ECRYPTFS_KERNEL_H */ 765#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index eb3dc4c7ac06..9e944057001b 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback {
77 77
78/* Inspired by generic filldir in fs/readdir.c */ 78/* Inspired by generic filldir in fs/readdir.c */
79static int 79static int
80ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, 80ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
81 u64 ino, unsigned int d_type) 81 loff_t offset, u64 ino, unsigned int d_type)
82{ 82{
83 struct ecryptfs_crypt_stat *crypt_stat;
84 struct ecryptfs_getdents_callback *buf = 83 struct ecryptfs_getdents_callback *buf =
85 (struct ecryptfs_getdents_callback *)dirent; 84 (struct ecryptfs_getdents_callback *)dirent;
85 size_t name_size;
86 char *name;
86 int rc; 87 int rc;
87 int decoded_length;
88 char *decoded_name;
89 88
90 crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
91 buf->filldir_called++; 89 buf->filldir_called++;
92 decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, 90 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
93 &decoded_name); 91 buf->dentry, lower_name,
94 if (decoded_length < 0) { 92 lower_namelen);
95 rc = decoded_length; 93 if (rc) {
94 printk(KERN_ERR "%s: Error attempting to decode and decrypt "
95 "filename [%s]; rc = [%d]\n", __func__, lower_name,
96 rc);
96 goto out; 97 goto out;
97 } 98 }
98 rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, 99 rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
99 ino, d_type); 100 kfree(name);
100 kfree(decoded_name);
101 if (rc >= 0) 101 if (rc >= 0)
102 buf->entries_written++; 102 buf->entries_written++;
103out: 103out:
@@ -106,8 +106,8 @@ out:
106 106
107/** 107/**
108 * ecryptfs_readdir 108 * ecryptfs_readdir
109 * @file: The ecryptfs file struct 109 * @file: The eCryptfs directory file
110 * @dirent: Directory entry 110 * @dirent: Directory entry handle
111 * @filldir: The filldir callback function 111 * @filldir: The filldir callback function
112 */ 112 */
113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) 113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
275static int 275static int
276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
277{ 277{
278 struct file *lower_file = ecryptfs_file_to_lower(file); 278 return vfs_fsync(ecryptfs_file_to_lower(file),
279 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 279 ecryptfs_dentry_to_lower(dentry),
280 struct inode *lower_inode = lower_dentry->d_inode; 280 datasync);
281 int rc = -EINVAL;
282
283 if (lower_inode->i_fop->fsync) {
284 mutex_lock(&lower_inode->i_mutex);
285 rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
286 datasync);
287 mutex_unlock(&lower_inode->i_mutex);
288 }
289 return rc;
290} 281}
291 282
292static int ecryptfs_fasync(int fd, struct file *file, int flag) 283static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 89209f00f9c7..5697899a168d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir)
52/** 52/**
53 * ecryptfs_create_underlying_file 53 * ecryptfs_create_underlying_file
54 * @lower_dir_inode: inode of the parent in the lower fs of the new file 54 * @lower_dir_inode: inode of the parent in the lower fs of the new file
55 * @lower_dentry: New file's dentry in the lower fs 55 * @dentry: New file's dentry
56 * @ecryptfs_dentry: New file's dentry in ecryptfs
57 * @mode: The mode of the new file 56 * @mode: The mode of the new file
58 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount 57 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
59 * 58 *
@@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
228{ 227{
229 int rc; 228 int rc;
230 229
231 /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens 230 /* ecryptfs_do_create() calls ecryptfs_interpose() */
232 * the crypt_stat->lower_file (persistent file) */
233 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); 231 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
234 if (unlikely(rc)) { 232 if (unlikely(rc)) {
235 ecryptfs_printk(KERN_WARNING, "Failed to create file in" 233 ecryptfs_printk(KERN_WARNING, "Failed to create file in"
@@ -244,141 +242,91 @@ out:
244} 242}
245 243
246/** 244/**
247 * ecryptfs_lookup 245 * ecryptfs_lookup_and_interpose_lower - Perform a lookup
248 * @dir: inode
249 * @dentry: The dentry
250 * @nd: nameidata, may be NULL
251 *
252 * Find a file on disk. If the file does not exist, then we'll add it to the
253 * dentry cache and continue on to read it from the disk.
254 */ 246 */
255static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, 247int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
256 struct nameidata *nd) 248 struct dentry *lower_dentry,
249 struct ecryptfs_crypt_stat *crypt_stat,
250 struct inode *ecryptfs_dir_inode,
251 struct nameidata *ecryptfs_nd)
257{ 252{
258 int rc = 0;
259 struct dentry *lower_dir_dentry; 253 struct dentry *lower_dir_dentry;
260 struct dentry *lower_dentry;
261 struct vfsmount *lower_mnt; 254 struct vfsmount *lower_mnt;
262 char *encoded_name; 255 struct inode *lower_inode;
263 int encoded_namelen;
264 struct ecryptfs_crypt_stat *crypt_stat = NULL;
265 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 256 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
266 char *page_virt = NULL; 257 char *page_virt = NULL;
267 struct inode *lower_inode;
268 u64 file_size; 258 u64 file_size;
259 int rc = 0;
269 260
270 lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); 261 lower_dir_dentry = lower_dentry->d_parent;
271 dentry->d_op = &ecryptfs_dops; 262 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
272 if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) 263 ecryptfs_dentry->d_parent));
273 || (dentry->d_name.len == 2
274 && !strcmp(dentry->d_name.name, ".."))) {
275 d_drop(dentry);
276 goto out;
277 }
278 encoded_namelen = ecryptfs_encode_filename(crypt_stat,
279 dentry->d_name.name,
280 dentry->d_name.len,
281 &encoded_name);
282 if (encoded_namelen < 0) {
283 rc = encoded_namelen;
284 d_drop(dentry);
285 goto out;
286 }
287 ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
288 "= [%d]\n", encoded_name, encoded_namelen);
289 lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
290 encoded_namelen - 1);
291 kfree(encoded_name);
292 if (IS_ERR(lower_dentry)) {
293 ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
294 rc = PTR_ERR(lower_dentry);
295 d_drop(dentry);
296 goto out;
297 }
298 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
299 ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
300 "d_name.name = [%s]\n", lower_dentry,
301 lower_dentry->d_name.name);
302 lower_inode = lower_dentry->d_inode; 264 lower_inode = lower_dentry->d_inode;
303 fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); 265 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
304 BUG_ON(!atomic_read(&lower_dentry->d_count)); 266 BUG_ON(!atomic_read(&lower_dentry->d_count));
305 ecryptfs_set_dentry_private(dentry, 267 ecryptfs_set_dentry_private(ecryptfs_dentry,
306 kmem_cache_alloc(ecryptfs_dentry_info_cache, 268 kmem_cache_alloc(ecryptfs_dentry_info_cache,
307 GFP_KERNEL)); 269 GFP_KERNEL));
308 if (!ecryptfs_dentry_to_private(dentry)) { 270 if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
309 rc = -ENOMEM; 271 rc = -ENOMEM;
310 ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " 272 printk(KERN_ERR "%s: Out of memory whilst attempting "
311 "to allocate ecryptfs_dentry_info struct\n"); 273 "to allocate ecryptfs_dentry_info struct\n",
274 __func__);
312 goto out_dput; 275 goto out_dput;
313 } 276 }
314 ecryptfs_set_dentry_lower(dentry, lower_dentry); 277 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
315 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); 278 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
316 if (!lower_dentry->d_inode) { 279 if (!lower_dentry->d_inode) {
317 /* We want to add because we couldn't find in lower */ 280 /* We want to add because we couldn't find in lower */
318 d_add(dentry, NULL); 281 d_add(ecryptfs_dentry, NULL);
319 goto out; 282 goto out;
320 } 283 }
321 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 284 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
322 ECRYPTFS_INTERPOSE_FLAG_D_ADD); 285 ecryptfs_dir_inode->i_sb, 1);
323 if (rc) { 286 if (rc) {
324 ecryptfs_printk(KERN_ERR, "Error interposing\n"); 287 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
288 __func__, rc);
325 goto out; 289 goto out;
326 } 290 }
327 if (S_ISDIR(lower_inode->i_mode)) { 291 if (S_ISDIR(lower_inode->i_mode))
328 ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
329 goto out; 292 goto out;
330 } 293 if (S_ISLNK(lower_inode->i_mode))
331 if (S_ISLNK(lower_inode->i_mode)) {
332 ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
333 goto out; 294 goto out;
334 } 295 if (special_file(lower_inode->i_mode))
335 if (special_file(lower_inode->i_mode)) {
336 ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
337 goto out; 296 goto out;
338 } 297 if (!ecryptfs_nd)
339 if (!nd) {
340 ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
341 "as we *think* we are about to unlink\n");
342 goto out; 298 goto out;
343 }
344 /* Released in this function */ 299 /* Released in this function */
345 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, 300 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
346 GFP_USER);
347 if (!page_virt) { 301 if (!page_virt) {
302 printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
303 __func__);
348 rc = -ENOMEM; 304 rc = -ENOMEM;
349 ecryptfs_printk(KERN_ERR,
350 "Cannot ecryptfs_kmalloc a page\n");
351 goto out; 305 goto out;
352 } 306 }
353 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 307 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
354 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) 308 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
355 ecryptfs_set_default_sizes(crypt_stat);
356 if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
357 rc = ecryptfs_init_persistent_file(dentry);
358 if (rc) { 309 if (rc) {
359 printk(KERN_ERR "%s: Error attempting to initialize " 310 printk(KERN_ERR "%s: Error attempting to initialize "
360 "the persistent file for the dentry with name " 311 "the persistent file for the dentry with name "
361 "[%s]; rc = [%d]\n", __func__, 312 "[%s]; rc = [%d]\n", __func__,
362 dentry->d_name.name, rc); 313 ecryptfs_dentry->d_name.name, rc);
363 goto out; 314 goto out_free_kmem;
364 } 315 }
365 } 316 }
366 rc = ecryptfs_read_and_validate_header_region(page_virt, 317 rc = ecryptfs_read_and_validate_header_region(page_virt,
367 dentry->d_inode); 318 ecryptfs_dentry->d_inode);
368 if (rc) { 319 if (rc) {
369 rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); 320 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
321 ecryptfs_dentry);
370 if (rc) { 322 if (rc) {
371 printk(KERN_DEBUG "Valid metadata not found in header "
372 "region or xattr region; treating file as "
373 "unencrypted\n");
374 rc = 0; 323 rc = 0;
375 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 324 goto out_free_kmem;
376 goto out;
377 } 325 }
378 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; 326 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
379 } 327 }
380 mount_crypt_stat = &ecryptfs_superblock_to_private( 328 mount_crypt_stat = &ecryptfs_superblock_to_private(
381 dentry->d_sb)->mount_crypt_stat; 329 ecryptfs_dentry->d_sb)->mount_crypt_stat;
382 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 330 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
383 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 331 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
384 file_size = (crypt_stat->num_header_bytes_at_front 332 file_size = (crypt_stat->num_header_bytes_at_front
@@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
388 } else { 336 } else {
389 file_size = get_unaligned_be64(page_virt); 337 file_size = get_unaligned_be64(page_virt);
390 } 338 }
391 i_size_write(dentry->d_inode, (loff_t)file_size); 339 i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size);
340out_free_kmem:
392 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 341 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
393 goto out; 342 goto out;
394
395out_dput: 343out_dput:
396 dput(lower_dentry); 344 dput(lower_dentry);
397 d_drop(dentry); 345 d_drop(ecryptfs_dentry);
398out: 346out:
347 return rc;
348}
349
350/**
351 * ecryptfs_lookup
352 * @ecryptfs_dir_inode: The eCryptfs directory inode
353 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
354 * @ecryptfs_nd: nameidata; may be NULL
355 *
356 * Find a file on disk. If the file does not exist, then we'll add it to the
357 * dentry cache and continue on to read it from the disk.
358 */
359static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
360 struct dentry *ecryptfs_dentry,
361 struct nameidata *ecryptfs_nd)
362{
363 char *encrypted_and_encoded_name = NULL;
364 size_t encrypted_and_encoded_name_size;
365 struct ecryptfs_crypt_stat *crypt_stat = NULL;
366 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
367 struct ecryptfs_inode_info *inode_info;
368 struct dentry *lower_dir_dentry, *lower_dentry;
369 int rc = 0;
370
371 ecryptfs_dentry->d_op = &ecryptfs_dops;
372 if ((ecryptfs_dentry->d_name.len == 1
373 && !strcmp(ecryptfs_dentry->d_name.name, "."))
374 || (ecryptfs_dentry->d_name.len == 2
375 && !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
376 goto out_d_drop;
377 }
378 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
379 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
380 lower_dir_dentry,
381 ecryptfs_dentry->d_name.len);
382 if (IS_ERR(lower_dentry)) {
383 rc = PTR_ERR(lower_dentry);
384 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
385 "lower_dentry = [%s]\n", __func__, rc,
386 ecryptfs_dentry->d_name.name);
387 goto out_d_drop;
388 }
389 if (lower_dentry->d_inode)
390 goto lookup_and_interpose;
391 inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
392 if (inode_info) {
393 crypt_stat = &inode_info->crypt_stat;
394 /* TODO: lock for crypt_stat comparison */
395 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
396 ecryptfs_set_default_sizes(crypt_stat);
397 }
398 if (crypt_stat)
399 mount_crypt_stat = crypt_stat->mount_crypt_stat;
400 else
401 mount_crypt_stat = &ecryptfs_superblock_to_private(
402 ecryptfs_dentry->d_sb)->mount_crypt_stat;
403 if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
404 && !(mount_crypt_stat && (mount_crypt_stat->flags
405 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
406 goto lookup_and_interpose;
407 dput(lower_dentry);
408 rc = ecryptfs_encrypt_and_encode_filename(
409 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
410 crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
411 ecryptfs_dentry->d_name.len);
412 if (rc) {
413 printk(KERN_ERR "%s: Error attempting to encrypt and encode "
414 "filename; rc = [%d]\n", __func__, rc);
415 goto out_d_drop;
416 }
417 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
418 lower_dir_dentry,
419 encrypted_and_encoded_name_size - 1);
420 if (IS_ERR(lower_dentry)) {
421 rc = PTR_ERR(lower_dentry);
422 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
423 "lower_dentry = [%s]\n", __func__, rc,
424 encrypted_and_encoded_name);
425 goto out_d_drop;
426 }
427lookup_and_interpose:
428 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
429 crypt_stat, ecryptfs_dir_inode,
430 ecryptfs_nd);
431 goto out;
432out_d_drop:
433 d_drop(ecryptfs_dentry);
434out:
435 kfree(encrypted_and_encoded_name);
399 return ERR_PTR(rc); 436 return ERR_PTR(rc);
400} 437}
401 438
@@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
466 struct dentry *lower_dentry; 503 struct dentry *lower_dentry;
467 struct dentry *lower_dir_dentry; 504 struct dentry *lower_dir_dentry;
468 char *encoded_symname; 505 char *encoded_symname;
469 int encoded_symlen; 506 size_t encoded_symlen;
470 struct ecryptfs_crypt_stat *crypt_stat = NULL; 507 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
471 508
472 lower_dentry = ecryptfs_dentry_to_lower(dentry); 509 lower_dentry = ecryptfs_dentry_to_lower(dentry);
473 dget(lower_dentry); 510 dget(lower_dentry);
474 lower_dir_dentry = lock_parent(lower_dentry); 511 lower_dir_dentry = lock_parent(lower_dentry);
475 encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, 512 mount_crypt_stat = &ecryptfs_superblock_to_private(
476 strlen(symname), 513 dir->i_sb)->mount_crypt_stat;
477 &encoded_symname); 514 rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
478 if (encoded_symlen < 0) { 515 &encoded_symlen,
479 rc = encoded_symlen; 516 NULL,
517 mount_crypt_stat, symname,
518 strlen(symname));
519 if (rc)
480 goto out_lock; 520 goto out_lock;
481 }
482 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, 521 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
483 encoded_symname); 522 encoded_symname);
484 kfree(encoded_symname); 523 kfree(encoded_symname);
@@ -602,53 +641,54 @@ out_lock:
602} 641}
603 642
604static int 643static int
605ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) 644ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
606{ 645{
607 int rc;
608 struct dentry *lower_dentry;
609 char *decoded_name;
610 char *lower_buf; 646 char *lower_buf;
611 mm_segment_t old_fs; 647 struct dentry *lower_dentry;
612 struct ecryptfs_crypt_stat *crypt_stat; 648 struct ecryptfs_crypt_stat *crypt_stat;
649 char *plaintext_name;
650 size_t plaintext_name_size;
651 mm_segment_t old_fs;
652 int rc;
613 653
614 lower_dentry = ecryptfs_dentry_to_lower(dentry); 654 lower_dentry = ecryptfs_dentry_to_lower(dentry);
615 if (!lower_dentry->d_inode->i_op || 655 if (!lower_dentry->d_inode->i_op->readlink) {
616 !lower_dentry->d_inode->i_op->readlink) {
617 rc = -EINVAL; 656 rc = -EINVAL;
618 goto out; 657 goto out;
619 } 658 }
659 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
620 /* Released in this function */ 660 /* Released in this function */
621 lower_buf = kmalloc(bufsiz, GFP_KERNEL); 661 lower_buf = kmalloc(bufsiz, GFP_KERNEL);
622 if (lower_buf == NULL) { 662 if (lower_buf == NULL) {
623 ecryptfs_printk(KERN_ERR, "Out of memory\n"); 663 printk(KERN_ERR "%s: Out of memory whilst attempting to "
664 "kmalloc [%d] bytes\n", __func__, bufsiz);
624 rc = -ENOMEM; 665 rc = -ENOMEM;
625 goto out; 666 goto out;
626 } 667 }
627 old_fs = get_fs(); 668 old_fs = get_fs();
628 set_fs(get_ds()); 669 set_fs(get_ds());
629 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
630 "lower_dentry->d_name.name = [%s]\n",
631 lower_dentry->d_name.name);
632 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 670 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
633 (char __user *)lower_buf, 671 (char __user *)lower_buf,
634 bufsiz); 672 bufsiz);
635 set_fs(old_fs); 673 set_fs(old_fs);
636 if (rc >= 0) { 674 if (rc >= 0) {
637 crypt_stat = NULL; 675 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
638 rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, 676 &plaintext_name_size,
639 &decoded_name); 677 dentry, lower_buf,
640 if (rc == -ENOMEM) 678 rc);
679 if (rc) {
680 printk(KERN_ERR "%s: Error attempting to decode and "
681 "decrypt filename; rc = [%d]\n", __func__,
682 rc);
641 goto out_free_lower_buf; 683 goto out_free_lower_buf;
642 if (rc > 0) {
643 ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
644 "to userspace: [%*s]\n", rc,
645 decoded_name);
646 if (copy_to_user(buf, decoded_name, rc))
647 rc = -EFAULT;
648 } 684 }
649 kfree(decoded_name); 685 rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
650 fsstack_copy_attr_atime(dentry->d_inode, 686 if (rc)
651 lower_dentry->d_inode); 687 rc = -EFAULT;
688 else
689 rc = plaintext_name_size;
690 kfree(plaintext_name);
691 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
652 } 692 }
653out_free_lower_buf: 693out_free_lower_buf:
654 kfree(lower_buf); 694 kfree(lower_buf);
@@ -670,13 +710,12 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
670 } 710 }
671 old_fs = get_fs(); 711 old_fs = get_fs();
672 set_fs(get_ds()); 712 set_fs(get_ds());
673 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
674 "dentry->d_name.name = [%s]\n", dentry->d_name.name);
675 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); 713 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
676 buf[rc] = '\0';
677 set_fs(old_fs); 714 set_fs(old_fs);
678 if (rc < 0) 715 if (rc < 0)
679 goto out_free; 716 goto out_free;
717 else
718 buf[rc] = '\0';
680 rc = 0; 719 rc = 0;
681 nd_set_link(nd, buf); 720 nd_set_link(nd, buf);
682 goto out; 721 goto out;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 0d713b691941..ff539420cc6f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
358 /* verify that everything through the encrypted FEK size is present */ 358 /* verify that everything through the encrypted FEK size is present */
359 if (message_len < 4) { 359 if (message_len < 4) {
360 rc = -EIO; 360 rc = -EIO;
361 printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " 361 printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable "
362 "message length is [%d]\n", __func__, message_len, 4); 362 "message length is [%d]\n", __func__, message_len, 4);
363 goto out; 363 goto out;
364 } 364 }
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
385 i += data_len; 385 i += data_len;
386 if (message_len < (i + key_rec->enc_key_size)) { 386 if (message_len < (i + key_rec->enc_key_size)) {
387 rc = -EIO; 387 rc = -EIO;
388 printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", 388 printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n",
389 __func__, message_len, (i + key_rec->enc_key_size)); 389 __func__, message_len, (i + key_rec->enc_key_size));
390 goto out; 390 goto out;
391 } 391 }
392 if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { 392 if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
393 rc = -EIO; 393 rc = -EIO;
394 printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " 394 printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than "
395 "the maximum key size [%d]\n", __func__, 395 "the maximum key size [%d]\n", __func__,
396 key_rec->enc_key_size, 396 key_rec->enc_key_size,
397 ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); 397 ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
@@ -403,6 +403,580 @@ out:
403} 403}
404 404
405static int 405static int
406ecryptfs_find_global_auth_tok_for_sig(
407 struct ecryptfs_global_auth_tok **global_auth_tok,
408 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
409{
410 struct ecryptfs_global_auth_tok *walker;
411 int rc = 0;
412
413 (*global_auth_tok) = NULL;
414 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
415 list_for_each_entry(walker,
416 &mount_crypt_stat->global_auth_tok_list,
417 mount_crypt_stat_list) {
418 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
419 (*global_auth_tok) = walker;
420 goto out;
421 }
422 }
423 rc = -EINVAL;
424out:
425 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
426 return rc;
427}
428
429/**
430 * ecryptfs_find_auth_tok_for_sig
431 * @auth_tok: Set to the matching auth_tok; NULL if not found
432 * @crypt_stat: inode crypt_stat crypto context
433 * @sig: Sig of auth_tok to find
434 *
435 * For now, this function simply looks at the registered auth_tok's
436 * linked off the mount_crypt_stat, so all the auth_toks that can be
437 * used must be registered at mount time. This function could
438 * potentially try a lot harder to find auth_tok's (e.g., by calling
439 * out to ecryptfsd to dynamically retrieve an auth_tok object) so
440 * that static registration of auth_tok's will no longer be necessary.
441 *
442 * Returns zero on no error; non-zero on error
443 */
444static int
445ecryptfs_find_auth_tok_for_sig(
446 struct ecryptfs_auth_tok **auth_tok,
447 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
448 char *sig)
449{
450 struct ecryptfs_global_auth_tok *global_auth_tok;
451 int rc = 0;
452
453 (*auth_tok) = NULL;
454 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
455 mount_crypt_stat, sig)) {
456 struct key *auth_tok_key;
457
458 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
459 sig);
460 } else
461 (*auth_tok) = global_auth_tok->global_auth_tok;
462 return rc;
463}
464
465/**
466 * write_tag_70_packet can gobble a lot of stack space. We stuff most
467 * of the function's parameters in a kmalloc'd struct to help reduce
468 * eCryptfs' overall stack usage.
469 */
470struct ecryptfs_write_tag_70_packet_silly_stack {
471 u8 cipher_code;
472 size_t max_packet_size;
473 size_t packet_size_len;
474 size_t block_aligned_filename_size;
475 size_t block_size;
476 size_t i;
477 size_t j;
478 size_t num_rand_bytes;
479 struct mutex *tfm_mutex;
480 char *block_aligned_filename;
481 struct ecryptfs_auth_tok *auth_tok;
482 struct scatterlist src_sg;
483 struct scatterlist dst_sg;
484 struct blkcipher_desc desc;
485 char iv[ECRYPTFS_MAX_IV_BYTES];
486 char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
487 char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
488 struct hash_desc hash_desc;
489 struct scatterlist hash_sg;
490};
491
492/**
493 * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
494 * @filename: NULL-terminated filename string
495 *
496 * This is the simplest mechanism for achieving filename encryption in
497 * eCryptfs. It encrypts the given filename with the mount-wide
498 * filename encryption key (FNEK) and stores it in a packet to @dest,
499 * which the callee will encode and write directly into the dentry
500 * name.
501 */
502int
503ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
504 size_t *packet_size,
505 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
506 char *filename, size_t filename_size)
507{
508 struct ecryptfs_write_tag_70_packet_silly_stack *s;
509 int rc = 0;
510
511 s = kmalloc(sizeof(*s), GFP_KERNEL);
512 if (!s) {
513 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
514 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
515 goto out;
516 }
517 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
518 (*packet_size) = 0;
519 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
520 &s->desc.tfm,
521 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
522 if (unlikely(rc)) {
523 printk(KERN_ERR "Internal error whilst attempting to get "
524 "tfm and mutex for cipher name [%s]; rc = [%d]\n",
525 mount_crypt_stat->global_default_fn_cipher_name, rc);
526 goto out;
527 }
528 mutex_lock(s->tfm_mutex);
529 s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
530 /* Plus one for the \0 separator between the random prefix
531 * and the plaintext filename */
532 s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
533 s->block_aligned_filename_size = (s->num_rand_bytes + filename_size);
534 if ((s->block_aligned_filename_size % s->block_size) != 0) {
535 s->num_rand_bytes += (s->block_size
536 - (s->block_aligned_filename_size
537 % s->block_size));
538 s->block_aligned_filename_size = (s->num_rand_bytes
539 + filename_size);
540 }
541 /* Octet 0: Tag 70 identifier
542 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
543 * and block-aligned encrypted filename size)
544 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
545 * Octet N2-N3: Cipher identifier (1 octet)
546 * Octets N3-N4: Block-aligned encrypted filename
547 * - Consists of a minimum number of random characters, a \0
548 * separator, and then the filename */
549 s->max_packet_size = (1 /* Tag 70 identifier */
550 + 3 /* Max Tag 70 packet size */
551 + ECRYPTFS_SIG_SIZE /* FNEK sig */
552 + 1 /* Cipher identifier */
553 + s->block_aligned_filename_size);
554 if (dest == NULL) {
555 (*packet_size) = s->max_packet_size;
556 goto out_unlock;
557 }
558 if (s->max_packet_size > (*remaining_bytes)) {
559 printk(KERN_WARNING "%s: Require [%zd] bytes to write; only "
560 "[%zd] available\n", __func__, s->max_packet_size,
561 (*remaining_bytes));
562 rc = -EINVAL;
563 goto out_unlock;
564 }
565 s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
566 GFP_KERNEL);
567 if (!s->block_aligned_filename) {
568 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
569 "kzalloc [%zd] bytes\n", __func__,
570 s->block_aligned_filename_size);
571 rc = -ENOMEM;
572 goto out_unlock;
573 }
574 s->i = 0;
575 dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
576 rc = ecryptfs_write_packet_length(&dest[s->i],
577 (ECRYPTFS_SIG_SIZE
578 + 1 /* Cipher code */
579 + s->block_aligned_filename_size),
580 &s->packet_size_len);
581 if (rc) {
582 printk(KERN_ERR "%s: Error generating tag 70 packet "
583 "header; cannot generate packet length; rc = [%d]\n",
584 __func__, rc);
585 goto out_free_unlock;
586 }
587 s->i += s->packet_size_len;
588 ecryptfs_from_hex(&dest[s->i],
589 mount_crypt_stat->global_default_fnek_sig,
590 ECRYPTFS_SIG_SIZE);
591 s->i += ECRYPTFS_SIG_SIZE;
592 s->cipher_code = ecryptfs_code_for_cipher_string(
593 mount_crypt_stat->global_default_fn_cipher_name,
594 mount_crypt_stat->global_default_fn_cipher_key_bytes);
595 if (s->cipher_code == 0) {
596 printk(KERN_WARNING "%s: Unable to generate code for "
597 "cipher [%s] with key bytes [%zd]\n", __func__,
598 mount_crypt_stat->global_default_fn_cipher_name,
599 mount_crypt_stat->global_default_fn_cipher_key_bytes);
600 rc = -EINVAL;
601 goto out_free_unlock;
602 }
603 dest[s->i++] = s->cipher_code;
604 rc = ecryptfs_find_auth_tok_for_sig(
605 &s->auth_tok, mount_crypt_stat,
606 mount_crypt_stat->global_default_fnek_sig);
607 if (rc) {
608 printk(KERN_ERR "%s: Error attempting to find auth tok for "
609 "fnek sig [%s]; rc = [%d]\n", __func__,
610 mount_crypt_stat->global_default_fnek_sig, rc);
611 goto out_free_unlock;
612 }
613 /* TODO: Support other key modules than passphrase for
614 * filename encryption */
615 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
616 sg_init_one(
617 &s->hash_sg,
618 (u8 *)s->auth_tok->token.password.session_key_encryption_key,
619 s->auth_tok->token.password.session_key_encryption_key_bytes);
620 s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
621 s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
622 CRYPTO_ALG_ASYNC);
623 if (IS_ERR(s->hash_desc.tfm)) {
624 rc = PTR_ERR(s->hash_desc.tfm);
625 printk(KERN_ERR "%s: Error attempting to "
626 "allocate hash crypto context; rc = [%d]\n",
627 __func__, rc);
628 goto out_free_unlock;
629 }
630 rc = crypto_hash_init(&s->hash_desc);
631 if (rc) {
632 printk(KERN_ERR
633 "%s: Error initializing crypto hash; rc = [%d]\n",
634 __func__, rc);
635 goto out_release_free_unlock;
636 }
637 rc = crypto_hash_update(
638 &s->hash_desc, &s->hash_sg,
639 s->auth_tok->token.password.session_key_encryption_key_bytes);
640 if (rc) {
641 printk(KERN_ERR
642 "%s: Error updating crypto hash; rc = [%d]\n",
643 __func__, rc);
644 goto out_release_free_unlock;
645 }
646 rc = crypto_hash_final(&s->hash_desc, s->hash);
647 if (rc) {
648 printk(KERN_ERR
649 "%s: Error finalizing crypto hash; rc = [%d]\n",
650 __func__, rc);
651 goto out_release_free_unlock;
652 }
653 for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
654 s->block_aligned_filename[s->j] =
655 s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
656 if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
657 == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
658 sg_init_one(&s->hash_sg, (u8 *)s->hash,
659 ECRYPTFS_TAG_70_DIGEST_SIZE);
660 rc = crypto_hash_init(&s->hash_desc);
661 if (rc) {
662 printk(KERN_ERR
663 "%s: Error initializing crypto hash; "
664 "rc = [%d]\n", __func__, rc);
665 goto out_release_free_unlock;
666 }
667 rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
668 ECRYPTFS_TAG_70_DIGEST_SIZE);
669 if (rc) {
670 printk(KERN_ERR
671 "%s: Error updating crypto hash; "
672 "rc = [%d]\n", __func__, rc);
673 goto out_release_free_unlock;
674 }
675 rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
676 if (rc) {
677 printk(KERN_ERR
678 "%s: Error finalizing crypto hash; "
679 "rc = [%d]\n", __func__, rc);
680 goto out_release_free_unlock;
681 }
682 memcpy(s->hash, s->tmp_hash,
683 ECRYPTFS_TAG_70_DIGEST_SIZE);
684 }
685 if (s->block_aligned_filename[s->j] == '\0')
686 s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
687 }
688 memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
689 filename_size);
690 rc = virt_to_scatterlist(s->block_aligned_filename,
691 s->block_aligned_filename_size, &s->src_sg, 1);
692 if (rc != 1) {
693 printk(KERN_ERR "%s: Internal error whilst attempting to "
694 "convert filename memory to scatterlist; "
695 "expected rc = 1; got rc = [%d]. "
696 "block_aligned_filename_size = [%zd]\n", __func__, rc,
697 s->block_aligned_filename_size);
698 goto out_release_free_unlock;
699 }
700 rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
701 &s->dst_sg, 1);
702 if (rc != 1) {
703 printk(KERN_ERR "%s: Internal error whilst attempting to "
704 "convert encrypted filename memory to scatterlist; "
705 "expected rc = 1; got rc = [%d]. "
706 "block_aligned_filename_size = [%zd]\n", __func__, rc,
707 s->block_aligned_filename_size);
708 goto out_release_free_unlock;
709 }
710 /* The characters in the first block effectively do the job
711 * of the IV here, so we just use 0's for the IV. Note the
712 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
713 * >= ECRYPTFS_MAX_IV_BYTES. */
714 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
715 s->desc.info = s->iv;
716 rc = crypto_blkcipher_setkey(
717 s->desc.tfm,
718 s->auth_tok->token.password.session_key_encryption_key,
719 mount_crypt_stat->global_default_fn_cipher_key_bytes);
720 if (rc < 0) {
721 printk(KERN_ERR "%s: Error setting key for crypto context; "
722 "rc = [%d]. s->auth_tok->token.password.session_key_"
723 "encryption_key = [0x%p]; mount_crypt_stat->"
724 "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
725 rc,
726 s->auth_tok->token.password.session_key_encryption_key,
727 mount_crypt_stat->global_default_fn_cipher_key_bytes);
728 goto out_release_free_unlock;
729 }
730 rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
731 s->block_aligned_filename_size);
732 if (rc) {
733 printk(KERN_ERR "%s: Error attempting to encrypt filename; "
734 "rc = [%d]\n", __func__, rc);
735 goto out_release_free_unlock;
736 }
737 s->i += s->block_aligned_filename_size;
738 (*packet_size) = s->i;
739 (*remaining_bytes) -= (*packet_size);
740out_release_free_unlock:
741 crypto_free_hash(s->hash_desc.tfm);
742out_free_unlock:
743 memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
744 kfree(s->block_aligned_filename);
745out_unlock:
746 mutex_unlock(s->tfm_mutex);
747out:
748 kfree(s);
749 return rc;
750}
751
752struct ecryptfs_parse_tag_70_packet_silly_stack {
753 u8 cipher_code;
754 size_t max_packet_size;
755 size_t packet_size_len;
756 size_t parsed_tag_70_packet_size;
757 size_t block_aligned_filename_size;
758 size_t block_size;
759 size_t i;
760 struct mutex *tfm_mutex;
761 char *decrypted_filename;
762 struct ecryptfs_auth_tok *auth_tok;
763 struct scatterlist src_sg;
764 struct scatterlist dst_sg;
765 struct blkcipher_desc desc;
766 char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
767 char iv[ECRYPTFS_MAX_IV_BYTES];
768 char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
769};
770
771/**
772 * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
773 * @filename: This function kmalloc's the memory for the filename
774 * @filename_size: This function sets this to the amount of memory
775 * kmalloc'd for the filename
776 * @packet_size: This function sets this to the the number of octets
777 * in the packet parsed
778 * @mount_crypt_stat: The mount-wide cryptographic context
779 * @data: The memory location containing the start of the tag 70
780 * packet
781 * @max_packet_size: The maximum legal size of the packet to be parsed
782 * from @data
783 *
784 * Returns zero on success; non-zero otherwise
785 */
786int
787ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
788 size_t *packet_size,
789 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
790 char *data, size_t max_packet_size)
791{
792 struct ecryptfs_parse_tag_70_packet_silly_stack *s;
793 int rc = 0;
794
795 (*packet_size) = 0;
796 (*filename_size) = 0;
797 (*filename) = NULL;
798 s = kmalloc(sizeof(*s), GFP_KERNEL);
799 if (!s) {
800 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
801 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
802 goto out;
803 }
804 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
805 if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
806 printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
807 "at least [%d]\n", __func__, max_packet_size,
808 (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
809 rc = -EINVAL;
810 goto out;
811 }
812 /* Octet 0: Tag 70 identifier
813 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
814 * and block-aligned encrypted filename size)
815 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
816 * Octet N2-N3: Cipher identifier (1 octet)
817 * Octets N3-N4: Block-aligned encrypted filename
818 * - Consists of a minimum number of random numbers, a \0
819 * separator, and then the filename */
820 if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) {
821 printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be "
822 "tag [0x%.2x]\n", __func__,
823 data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE);
824 rc = -EINVAL;
825 goto out;
826 }
827 rc = ecryptfs_parse_packet_length(&data[(*packet_size)],
828 &s->parsed_tag_70_packet_size,
829 &s->packet_size_len);
830 if (rc) {
831 printk(KERN_WARNING "%s: Error parsing packet length; "
832 "rc = [%d]\n", __func__, rc);
833 goto out;
834 }
835 s->block_aligned_filename_size = (s->parsed_tag_70_packet_size
836 - ECRYPTFS_SIG_SIZE - 1);
837 if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
838 > max_packet_size) {
839 printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet "
840 "size is [%zd]\n", __func__, max_packet_size,
841 (1 + s->packet_size_len + 1
842 + s->block_aligned_filename_size));
843 rc = -EINVAL;
844 goto out;
845 }
846 (*packet_size) += s->packet_size_len;
847 ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)],
848 ECRYPTFS_SIG_SIZE);
849 s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0';
850 (*packet_size) += ECRYPTFS_SIG_SIZE;
851 s->cipher_code = data[(*packet_size)++];
852 rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code);
853 if (rc) {
854 printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n",
855 __func__, s->cipher_code);
856 goto out;
857 }
858 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
859 &s->tfm_mutex,
860 s->cipher_string);
861 if (unlikely(rc)) {
862 printk(KERN_ERR "Internal error whilst attempting to get "
863 "tfm and mutex for cipher name [%s]; rc = [%d]\n",
864 s->cipher_string, rc);
865 goto out;
866 }
867 mutex_lock(s->tfm_mutex);
868 rc = virt_to_scatterlist(&data[(*packet_size)],
869 s->block_aligned_filename_size, &s->src_sg, 1);
870 if (rc != 1) {
871 printk(KERN_ERR "%s: Internal error whilst attempting to "
872 "convert encrypted filename memory to scatterlist; "
873 "expected rc = 1; got rc = [%d]. "
874 "block_aligned_filename_size = [%zd]\n", __func__, rc,
875 s->block_aligned_filename_size);
876 goto out_unlock;
877 }
878 (*packet_size) += s->block_aligned_filename_size;
879 s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
880 GFP_KERNEL);
881 if (!s->decrypted_filename) {
882 printk(KERN_ERR "%s: Out of memory whilst attempting to "
883 "kmalloc [%zd] bytes\n", __func__,
884 s->block_aligned_filename_size);
885 rc = -ENOMEM;
886 goto out_unlock;
887 }
888 rc = virt_to_scatterlist(s->decrypted_filename,
889 s->block_aligned_filename_size, &s->dst_sg, 1);
890 if (rc != 1) {
891 printk(KERN_ERR "%s: Internal error whilst attempting to "
892 "convert decrypted filename memory to scatterlist; "
893 "expected rc = 1; got rc = [%d]. "
894 "block_aligned_filename_size = [%zd]\n", __func__, rc,
895 s->block_aligned_filename_size);
896 goto out_free_unlock;
897 }
898 /* The characters in the first block effectively do the job of
899 * the IV here, so we just use 0's for the IV. Note the
900 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
901 * >= ECRYPTFS_MAX_IV_BYTES. */
902 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
903 s->desc.info = s->iv;
904 rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
905 s->fnek_sig_hex);
906 if (rc) {
907 printk(KERN_ERR "%s: Error attempting to find auth tok for "
908 "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
909 rc);
910 goto out_free_unlock;
911 }
912 /* TODO: Support other key modules than passphrase for
913 * filename encryption */
914 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
915 rc = crypto_blkcipher_setkey(
916 s->desc.tfm,
917 s->auth_tok->token.password.session_key_encryption_key,
918 mount_crypt_stat->global_default_fn_cipher_key_bytes);
919 if (rc < 0) {
920 printk(KERN_ERR "%s: Error setting key for crypto context; "
921 "rc = [%d]. s->auth_tok->token.password.session_key_"
922 "encryption_key = [0x%p]; mount_crypt_stat->"
923 "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
924 rc,
925 s->auth_tok->token.password.session_key_encryption_key,
926 mount_crypt_stat->global_default_fn_cipher_key_bytes);
927 goto out_free_unlock;
928 }
929 rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
930 s->block_aligned_filename_size);
931 if (rc) {
932 printk(KERN_ERR "%s: Error attempting to decrypt filename; "
933 "rc = [%d]\n", __func__, rc);
934 goto out_free_unlock;
935 }
936 s->i = 0;
937 while (s->decrypted_filename[s->i] != '\0'
938 && s->i < s->block_aligned_filename_size)
939 s->i++;
940 if (s->i == s->block_aligned_filename_size) {
941 printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
942 "find valid separator between random characters and "
943 "the filename\n", __func__);
944 rc = -EINVAL;
945 goto out_free_unlock;
946 }
947 s->i++;
948 (*filename_size) = (s->block_aligned_filename_size - s->i);
949 if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) {
950 printk(KERN_WARNING "%s: Filename size is [%zd], which is "
951 "invalid\n", __func__, (*filename_size));
952 rc = -EINVAL;
953 goto out_free_unlock;
954 }
955 (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
956 if (!(*filename)) {
957 printk(KERN_ERR "%s: Out of memory whilst attempting to "
958 "kmalloc [%zd] bytes\n", __func__,
959 ((*filename_size) + 1));
960 rc = -ENOMEM;
961 goto out_free_unlock;
962 }
963 memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size));
964 (*filename)[(*filename_size)] = '\0';
965out_free_unlock:
966 kfree(s->decrypted_filename);
967out_unlock:
968 mutex_unlock(s->tfm_mutex);
969out:
970 if (rc) {
971 (*packet_size) = 0;
972 (*filename_size) = 0;
973 (*filename) = NULL;
974 }
975 kfree(s);
976 return rc;
977}
978
979static int
406ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) 980ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok)
407{ 981{
408 int rc = 0; 982 int rc = 0;
@@ -897,30 +1471,6 @@ out:
897 return rc; 1471 return rc;
898} 1472}
899 1473
900static int
901ecryptfs_find_global_auth_tok_for_sig(
902 struct ecryptfs_global_auth_tok **global_auth_tok,
903 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
904{
905 struct ecryptfs_global_auth_tok *walker;
906 int rc = 0;
907
908 (*global_auth_tok) = NULL;
909 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
910 list_for_each_entry(walker,
911 &mount_crypt_stat->global_auth_tok_list,
912 mount_crypt_stat_list) {
913 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
914 (*global_auth_tok) = walker;
915 goto out;
916 }
917 }
918 rc = -EINVAL;
919out:
920 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
921 return rc;
922}
923
924/** 1474/**
925 * ecryptfs_verify_version 1475 * ecryptfs_verify_version
926 * @version: The version number to confirm 1476 * @version: The version number to confirm
@@ -990,43 +1540,6 @@ out:
990} 1540}
991 1541
992/** 1542/**
993 * ecryptfs_find_auth_tok_for_sig
994 * @auth_tok: Set to the matching auth_tok; NULL if not found
995 * @crypt_stat: inode crypt_stat crypto context
996 * @sig: Sig of auth_tok to find
997 *
998 * For now, this function simply looks at the registered auth_tok's
999 * linked off the mount_crypt_stat, so all the auth_toks that can be
1000 * used must be registered at mount time. This function could
1001 * potentially try a lot harder to find auth_tok's (e.g., by calling
1002 * out to ecryptfsd to dynamically retrieve an auth_tok object) so
1003 * that static registration of auth_tok's will no longer be necessary.
1004 *
1005 * Returns zero on no error; non-zero on error
1006 */
1007static int
1008ecryptfs_find_auth_tok_for_sig(
1009 struct ecryptfs_auth_tok **auth_tok,
1010 struct ecryptfs_crypt_stat *crypt_stat, char *sig)
1011{
1012 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
1013 crypt_stat->mount_crypt_stat;
1014 struct ecryptfs_global_auth_tok *global_auth_tok;
1015 int rc = 0;
1016
1017 (*auth_tok) = NULL;
1018 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
1019 mount_crypt_stat, sig)) {
1020 struct key *auth_tok_key;
1021
1022 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
1023 sig);
1024 } else
1025 (*auth_tok) = global_auth_tok->global_auth_tok;
1026 return rc;
1027}
1028
1029/**
1030 * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. 1543 * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok.
1031 * @auth_tok: The passphrase authentication token to use to encrypt the FEK 1544 * @auth_tok: The passphrase authentication token to use to encrypt the FEK
1032 * @crypt_stat: The cryptographic context 1545 * @crypt_stat: The cryptographic context
@@ -1256,7 +1769,8 @@ find_next_matching_auth_tok:
1256 rc = -EINVAL; 1769 rc = -EINVAL;
1257 goto out_wipe_list; 1770 goto out_wipe_list;
1258 } 1771 }
1259 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat, 1772 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
1773 crypt_stat->mount_crypt_stat,
1260 candidate_auth_tok_sig); 1774 candidate_auth_tok_sig);
1261 if (matching_auth_tok) { 1775 if (matching_auth_tok) {
1262 found_auth_tok = 1; 1776 found_auth_tok = 1;
@@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
1336 int rc; 1850 int rc;
1337 1851
1338 rc = write_tag_66_packet(auth_tok->token.private_key.signature, 1852 rc = write_tag_66_packet(auth_tok->token.private_key.signature,
1339 ecryptfs_code_for_cipher_string(crypt_stat), 1853 ecryptfs_code_for_cipher_string(
1854 crypt_stat->cipher,
1855 crypt_stat->key_size),
1340 crypt_stat, &payload, &payload_len); 1856 crypt_stat, &payload, &payload_len);
1341 if (rc) { 1857 if (rc) {
1342 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); 1858 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
@@ -1696,7 +2212,8 @@ encrypted_session_key_set:
1696 dest[(*packet_size)++] = 0x04; /* version 4 */ 2212 dest[(*packet_size)++] = 0x04; /* version 4 */
1697 /* TODO: Break from RFC2440 so that arbitrary ciphers can be 2213 /* TODO: Break from RFC2440 so that arbitrary ciphers can be
1698 * specified with strings */ 2214 * specified with strings */
1699 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); 2215 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher,
2216 crypt_stat->key_size);
1700 if (cipher_code == 0) { 2217 if (cipher_code == 0) {
1701 ecryptfs_printk(KERN_WARNING, "Unable to generate code for " 2218 ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
1702 "cipher [%s]\n", crypt_stat->cipher); 2219 "cipher [%s]\n", crypt_stat->cipher);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fd630713c5c7..789cf2e1be1e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
206 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, 206 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
207 ecryptfs_opt_ecryptfs_key_bytes, 207 ecryptfs_opt_ecryptfs_key_bytes,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; 209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_err };
210 212
211static const match_table_t tokens = { 213static const match_table_t tokens = {
212 {ecryptfs_opt_sig, "sig=%s"}, 214 {ecryptfs_opt_sig, "sig=%s"},
@@ -217,6 +219,9 @@ static const match_table_t tokens = {
217 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, 219 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
218 {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, 220 {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"},
219 {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, 221 {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"},
222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
220 {ecryptfs_opt_err, NULL} 225 {ecryptfs_opt_err, NULL}
221}; 226};
222 227
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
281 int rc = 0; 286 int rc = 0;
282 int sig_set = 0; 287 int sig_set = 0;
283 int cipher_name_set = 0; 288 int cipher_name_set = 0;
289 int fn_cipher_name_set = 0;
284 int cipher_key_bytes; 290 int cipher_key_bytes;
285 int cipher_key_bytes_set = 0; 291 int cipher_key_bytes_set = 0;
292 int fn_cipher_key_bytes;
293 int fn_cipher_key_bytes_set = 0;
286 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 294 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
287 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 295 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
288 substring_t args[MAX_OPT_ARGS]; 296 substring_t args[MAX_OPT_ARGS];
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
290 char *sig_src; 298 char *sig_src;
291 char *cipher_name_dst; 299 char *cipher_name_dst;
292 char *cipher_name_src; 300 char *cipher_name_src;
301 char *fn_cipher_name_dst;
302 char *fn_cipher_name_src;
303 char *fnek_dst;
304 char *fnek_src;
293 char *cipher_key_bytes_src; 305 char *cipher_key_bytes_src;
306 char *fn_cipher_key_bytes_src;
294 307
295 if (!options) { 308 if (!options) {
296 rc = -EINVAL; 309 rc = -EINVAL;
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
322 global_default_cipher_name; 335 global_default_cipher_name;
323 strncpy(cipher_name_dst, cipher_name_src, 336 strncpy(cipher_name_dst, cipher_name_src,
324 ECRYPTFS_MAX_CIPHER_NAME_SIZE); 337 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
325 ecryptfs_printk(KERN_DEBUG, 338 cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
326 "The mount_crypt_stat "
327 "global_default_cipher_name set to: "
328 "[%s]\n", cipher_name_dst);
329 cipher_name_set = 1; 339 cipher_name_set = 1;
330 break; 340 break;
331 case ecryptfs_opt_ecryptfs_key_bytes: 341 case ecryptfs_opt_ecryptfs_key_bytes:
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
335 &cipher_key_bytes_src, 0); 345 &cipher_key_bytes_src, 0);
336 mount_crypt_stat->global_default_cipher_key_size = 346 mount_crypt_stat->global_default_cipher_key_size =
337 cipher_key_bytes; 347 cipher_key_bytes;
338 ecryptfs_printk(KERN_DEBUG,
339 "The mount_crypt_stat "
340 "global_default_cipher_key_size "
341 "set to: [%d]\n", mount_crypt_stat->
342 global_default_cipher_key_size);
343 cipher_key_bytes_set = 1; 348 cipher_key_bytes_set = 1;
344 break; 349 break;
345 case ecryptfs_opt_passthrough: 350 case ecryptfs_opt_passthrough:
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
356 mount_crypt_stat->flags |= 361 mount_crypt_stat->flags |=
357 ECRYPTFS_ENCRYPTED_VIEW_ENABLED; 362 ECRYPTFS_ENCRYPTED_VIEW_ENABLED;
358 break; 363 break;
364 case ecryptfs_opt_fnek_sig:
365 fnek_src = args[0].from;
366 fnek_dst =
367 mount_crypt_stat->global_default_fnek_sig;
368 strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX);
369 mount_crypt_stat->global_default_fnek_sig[
370 ECRYPTFS_SIG_SIZE_HEX] = '\0';
371 rc = ecryptfs_add_global_auth_tok(
372 mount_crypt_stat,
373 mount_crypt_stat->global_default_fnek_sig);
374 if (rc) {
375 printk(KERN_ERR "Error attempting to register "
376 "global fnek sig [%s]; rc = [%d]\n",
377 mount_crypt_stat->global_default_fnek_sig,
378 rc);
379 goto out;
380 }
381 mount_crypt_stat->flags |=
382 (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES
383 | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK);
384 break;
385 case ecryptfs_opt_fn_cipher:
386 fn_cipher_name_src = args[0].from;
387 fn_cipher_name_dst =
388 mount_crypt_stat->global_default_fn_cipher_name;
389 strncpy(fn_cipher_name_dst, fn_cipher_name_src,
390 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
391 mount_crypt_stat->global_default_fn_cipher_name[
392 ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
393 fn_cipher_name_set = 1;
394 break;
395 case ecryptfs_opt_fn_cipher_key_bytes:
396 fn_cipher_key_bytes_src = args[0].from;
397 fn_cipher_key_bytes =
398 (int)simple_strtol(fn_cipher_key_bytes_src,
399 &fn_cipher_key_bytes_src, 0);
400 mount_crypt_stat->global_default_fn_cipher_key_bytes =
401 fn_cipher_key_bytes;
402 fn_cipher_key_bytes_set = 1;
403 break;
359 case ecryptfs_opt_err: 404 case ecryptfs_opt_err:
360 default: 405 default:
361 ecryptfs_printk(KERN_WARNING, 406 printk(KERN_WARNING
362 "eCryptfs: unrecognized option '%s'\n", 407 "%s: eCryptfs: unrecognized option [%s]\n",
363 p); 408 __func__, p);
364 } 409 }
365 } 410 }
366 if (!sig_set) { 411 if (!sig_set) {
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
374 int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); 419 int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
375 420
376 BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); 421 BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
377
378 strcpy(mount_crypt_stat->global_default_cipher_name, 422 strcpy(mount_crypt_stat->global_default_cipher_name,
379 ECRYPTFS_DEFAULT_CIPHER); 423 ECRYPTFS_DEFAULT_CIPHER);
380 } 424 }
381 if (!cipher_key_bytes_set) { 425 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
426 && !fn_cipher_name_set)
427 strcpy(mount_crypt_stat->global_default_fn_cipher_name,
428 mount_crypt_stat->global_default_cipher_name);
429 if (!cipher_key_bytes_set)
382 mount_crypt_stat->global_default_cipher_key_size = 0; 430 mount_crypt_stat->global_default_cipher_key_size = 0;
383 } 431 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
432 && !fn_cipher_key_bytes_set)
433 mount_crypt_stat->global_default_fn_cipher_key_bytes =
434 mount_crypt_stat->global_default_cipher_key_size;
384 mutex_lock(&key_tfm_list_mutex); 435 mutex_lock(&key_tfm_list_mutex);
385 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, 436 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
386 NULL)) 437 NULL)) {
387 rc = ecryptfs_add_new_key_tfm( 438 rc = ecryptfs_add_new_key_tfm(
388 NULL, mount_crypt_stat->global_default_cipher_name, 439 NULL, mount_crypt_stat->global_default_cipher_name,
389 mount_crypt_stat->global_default_cipher_key_size); 440 mount_crypt_stat->global_default_cipher_key_size);
390 mutex_unlock(&key_tfm_list_mutex); 441 if (rc) {
391 if (rc) { 442 printk(KERN_ERR "Error attempting to initialize "
392 printk(KERN_ERR "Error attempting to initialize cipher with " 443 "cipher with name = [%s] and key size = [%td]; "
393 "name = [%s] and key size = [%td]; rc = [%d]\n", 444 "rc = [%d]\n",
394 mount_crypt_stat->global_default_cipher_name, 445 mount_crypt_stat->global_default_cipher_name,
395 mount_crypt_stat->global_default_cipher_key_size, rc); 446 mount_crypt_stat->global_default_cipher_key_size,
396 rc = -EINVAL; 447 rc);
397 goto out; 448 rc = -EINVAL;
449 mutex_unlock(&key_tfm_list_mutex);
450 goto out;
451 }
398 } 452 }
453 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
454 && !ecryptfs_tfm_exists(
455 mount_crypt_stat->global_default_fn_cipher_name, NULL)) {
456 rc = ecryptfs_add_new_key_tfm(
457 NULL, mount_crypt_stat->global_default_fn_cipher_name,
458 mount_crypt_stat->global_default_fn_cipher_key_bytes);
459 if (rc) {
460 printk(KERN_ERR "Error attempting to initialize "
461 "cipher with name = [%s] and key size = [%td]; "
462 "rc = [%d]\n",
463 mount_crypt_stat->global_default_fn_cipher_name,
464 mount_crypt_stat->global_default_fn_cipher_key_bytes,
465 rc);
466 rc = -EINVAL;
467 mutex_unlock(&key_tfm_list_mutex);
468 goto out;
469 }
470 }
471 mutex_unlock(&key_tfm_list_mutex);
399 rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); 472 rc = ecryptfs_init_global_auth_toks(mount_crypt_stat);
400 if (rc) { 473 if (rc)
401 printk(KERN_WARNING "One or more global auth toks could not " 474 printk(KERN_WARNING "One or more global auth toks could not "
402 "properly register; rc = [%d]\n", rc); 475 "properly register; rc = [%d]\n", rc);
403 }
404out: 476out:
405 return rc; 477 return rc;
406} 478}
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 6913f727624d..96ef51489e01 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
193 (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); 193 (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
194 if (!(*daemon)) { 194 if (!(*daemon)) {
195 rc = -ENOMEM; 195 rc = -ENOMEM;
196 printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " 196 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
197 "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); 197 "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
198 goto out; 198 goto out;
199 } 199 }
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
435 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); 435 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
436 if (!msg_ctx->msg) { 436 if (!msg_ctx->msg) {
437 rc = -ENOMEM; 437 rc = -ENOMEM;
438 printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " 438 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
439 "GFP_KERNEL memory\n", __func__, msg_size); 439 "GFP_KERNEL memory\n", __func__, msg_size);
440 goto unlock; 440 goto unlock;
441 } 441 }
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index efd95a0ed1ea..a67fea655f49 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
199 if (!msg_ctx->msg) { 199 if (!msg_ctx->msg) {
200 rc = -ENOMEM; 200 rc = -ENOMEM;
201 printk(KERN_ERR "%s: Out of memory whilst attempting " 201 printk(KERN_ERR "%s: Out of memory whilst attempting "
202 "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, 202 "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
203 (sizeof(*msg_ctx->msg) + data_size)); 203 (sizeof(*msg_ctx->msg) + data_size));
204 goto out_unlock; 204 goto out_unlock;
205 } 205 }
@@ -322,7 +322,7 @@ check_list:
322 if (count < total_length) { 322 if (count < total_length) {
323 rc = 0; 323 rc = 0;
324 printk(KERN_WARNING "%s: Only given user buffer of " 324 printk(KERN_WARNING "%s: Only given user buffer of "
325 "size [%Zd], but we need [%Zd] to read the " 325 "size [%zd], but we need [%zd] to read the "
326 "pending message\n", __func__, count, total_length); 326 "pending message\n", __func__, count, total_length);
327 goto out_unlock_msg_ctx; 327 goto out_unlock_msg_ctx;
328 } 328 }
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
376 376
377 if ((sizeof(*msg) + msg->data_len) != data_size) { 377 if ((sizeof(*msg) + msg->data_len) != data_size) {
378 printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " 378 printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = "
379 "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, 379 "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__,
380 (sizeof(*msg) + msg->data_len), data_size); 380 (sizeof(*msg) + msg->data_len), data_size);
381 rc = -EINVAL; 381 rc = -EINVAL;
382 goto out; 382 goto out;
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
421 data = kmalloc(count, GFP_KERNEL); 421 data = kmalloc(count, GFP_KERNEL);
422 if (!data) { 422 if (!data) {
423 printk(KERN_ERR "%s: Out of memory whilst attempting to " 423 printk(KERN_ERR "%s: Out of memory whilst attempting to "
424 "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); 424 "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
425 goto out; 425 goto out;
426 } 426 }
427 rc = copy_from_user(data, buf, count); 427 rc = copy_from_user(data, buf, count);
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
436 case ECRYPTFS_MSG_RESPONSE: 436 case ECRYPTFS_MSG_RESPONSE:
437 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { 437 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
438 printk(KERN_WARNING "%s: Minimum acceptable packet " 438 printk(KERN_WARNING "%s: Minimum acceptable packet "
439 "size is [%Zd], but amount of data written is " 439 "size is [%zd], but amount of data written is "
440 "only [%Zd]. Discarding response packet.\n", 440 "only [%zd]. Discarding response packet.\n",
441 __func__, 441 __func__,
442 (1 + 4 + 1 + sizeof(struct ecryptfs_message)), 442 (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
443 count); 443 count);
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
455 } 455 }
456 i += packet_size_length; 456 i += packet_size_length;
457 if ((1 + 4 + packet_size_length + packet_size) != count) { 457 if ((1 + 4 + packet_size_length + packet_size) != count) {
458 printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" 458 printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
459 " + packet_size([%Zd]))([%Zd]) != " 459 " + packet_size([%zd]))([%zd]) != "
460 "count([%Zd]). Invalid packet format.\n", 460 "count([%zd]). Invalid packet format.\n",
461 __func__, packet_size_length, packet_size, 461 __func__, packet_size_length, packet_size,
462 (1 + packet_size_length + packet_size), count); 462 (1 + packet_size_length + packet_size), count);
463 goto out_free; 463 goto out_free;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 04d7b3fa1ac6..46cec2b69796 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
288 loff_t prev_page_end_size; 288 loff_t prev_page_end_size;
289 int rc = 0; 289 int rc = 0;
290 290
291 page = __grab_cache_page(mapping, index); 291 page = grab_cache_page_write_begin(mapping, index, flags);
292 if (!page) 292 if (!page)
293 return -ENOMEM; 293 return -ENOMEM;
294 *pagep = page; 294 *pagep = page;
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
new file mode 100644
index 000000000000..6ebfc1c207a8
--- /dev/null
+++ b/fs/efs/Kconfig
@@ -0,0 +1,14 @@
1config EFS_FS
2 tristate "EFS file system support (read only) (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 help
5 EFS is an older file system used for non-ISO9660 CD-ROMs and hard
6 disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
7 uses the XFS file system for hard disk partitions however).
8
9 This implementation only offers read-only access. If you don't know
10 what all this is about, it's safe to say N. For more information
11 about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
12
13 To compile the EFS file system support as a module, choose M here: the
14 module will be called efs.
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08bf558d0408..5de2c2db3aa2 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -198,7 +198,7 @@ struct file *eventfd_fget(int fd)
198 return file; 198 return file;
199} 199}
200 200
201asmlinkage long sys_eventfd2(unsigned int count, int flags) 201SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
202{ 202{
203 int fd; 203 int fd;
204 struct eventfd_ctx *ctx; 204 struct eventfd_ctx *ctx;
@@ -228,8 +228,7 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags)
228 return fd; 228 return fd;
229} 229}
230 230
231asmlinkage long sys_eventfd(unsigned int count) 231SYSCALL_DEFINE1(eventfd, unsigned int, count)
232{ 232{
233 return sys_eventfd2(count, 0); 233 return sys_eventfd2(count, 0);
234} 234}
235
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 96355d505347..011b9b8c90c6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -234,8 +234,6 @@ struct ep_pqueue {
234/* 234/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 235 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 236 */
237/* Maximum number of epoll devices, per user */
238static int max_user_instances __read_mostly;
239/* Maximum number of epoll watched descriptors, per user */ 237/* Maximum number of epoll watched descriptors, per user */
240static int max_user_watches __read_mostly; 238static int max_user_watches __read_mostly;
241 239
@@ -261,14 +259,6 @@ static int zero;
261 259
262ctl_table epoll_table[] = { 260ctl_table epoll_table[] = {
263 { 261 {
264 .procname = "max_user_instances",
265 .data = &max_user_instances,
266 .maxlen = sizeof(int),
267 .mode = 0644,
268 .proc_handler = &proc_dointvec_minmax,
269 .extra1 = &zero,
270 },
271 {
272 .procname = "max_user_watches", 262 .procname = "max_user_watches",
273 .data = &max_user_watches, 263 .data = &max_user_watches,
274 .maxlen = sizeof(int), 264 .maxlen = sizeof(int),
@@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep)
491 481
492 mutex_unlock(&epmutex); 482 mutex_unlock(&epmutex);
493 mutex_destroy(&ep->mtx); 483 mutex_destroy(&ep->mtx);
494 atomic_dec(&ep->user->epoll_devs);
495 free_uid(ep->user); 484 free_uid(ep->user);
496 kfree(ep); 485 kfree(ep);
497} 486}
@@ -581,10 +570,6 @@ static int ep_alloc(struct eventpoll **pep)
581 struct eventpoll *ep; 570 struct eventpoll *ep;
582 571
583 user = get_current_user(); 572 user = get_current_user();
584 error = -EMFILE;
585 if (unlikely(atomic_read(&user->epoll_devs) >=
586 max_user_instances))
587 goto free_uid;
588 error = -ENOMEM; 573 error = -ENOMEM;
589 ep = kzalloc(sizeof(*ep), GFP_KERNEL); 574 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
590 if (unlikely(!ep)) 575 if (unlikely(!ep))
@@ -1110,7 +1095,7 @@ retry:
1110/* 1095/*
1111 * Open an eventpoll file descriptor. 1096 * Open an eventpoll file descriptor.
1112 */ 1097 */
1113asmlinkage long sys_epoll_create1(int flags) 1098SYSCALL_DEFINE1(epoll_create1, int, flags)
1114{ 1099{
1115 int error, fd = -1; 1100 int error, fd = -1;
1116 struct eventpoll *ep; 1101 struct eventpoll *ep;
@@ -1141,7 +1126,6 @@ asmlinkage long sys_epoll_create1(int flags)
1141 flags & O_CLOEXEC); 1126 flags & O_CLOEXEC);
1142 if (fd < 0) 1127 if (fd < 0)
1143 ep_free(ep); 1128 ep_free(ep);
1144 atomic_inc(&ep->user->epoll_devs);
1145 1129
1146error_return: 1130error_return:
1147 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 1131 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1150,7 +1134,7 @@ error_return:
1150 return fd; 1134 return fd;
1151} 1135}
1152 1136
1153asmlinkage long sys_epoll_create(int size) 1137SYSCALL_DEFINE1(epoll_create, int, size)
1154{ 1138{
1155 if (size < 0) 1139 if (size < 0)
1156 return -EINVAL; 1140 return -EINVAL;
@@ -1163,8 +1147,8 @@ asmlinkage long sys_epoll_create(int size)
1163 * the eventpoll file that enables the insertion/removal/change of 1147 * the eventpoll file that enables the insertion/removal/change of
1164 * file descriptors inside the interest set. 1148 * file descriptors inside the interest set.
1165 */ 1149 */
1166asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, 1150SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1167 struct epoll_event __user *event) 1151 struct epoll_event __user *, event)
1168{ 1152{
1169 int error; 1153 int error;
1170 struct file *file, *tfile; 1154 struct file *file, *tfile;
@@ -1261,8 +1245,8 @@ error_return:
1261 * Implement the event wait interface for the eventpoll file. It is the kernel 1245 * Implement the event wait interface for the eventpoll file. It is the kernel
1262 * part of the user space epoll_wait(2). 1246 * part of the user space epoll_wait(2).
1263 */ 1247 */
1264asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, 1248SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1265 int maxevents, int timeout) 1249 int, maxevents, int, timeout)
1266{ 1250{
1267 int error; 1251 int error;
1268 struct file *file; 1252 struct file *file;
@@ -1319,9 +1303,9 @@ error_return:
1319 * Implement the event wait interface for the eventpoll file. It is the kernel 1303 * Implement the event wait interface for the eventpoll file. It is the kernel
1320 * part of the user space epoll_pwait(2). 1304 * part of the user space epoll_pwait(2).
1321 */ 1305 */
1322asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, 1306SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
1323 int maxevents, int timeout, const sigset_t __user *sigmask, 1307 int, maxevents, int, timeout, const sigset_t __user *, sigmask,
1324 size_t sigsetsize) 1308 size_t, sigsetsize)
1325{ 1309{
1326 int error; 1310 int error;
1327 sigset_t ksigmask, sigsaved; 1311 sigset_t ksigmask, sigsaved;
@@ -1366,8 +1350,10 @@ static int __init eventpoll_init(void)
1366 struct sysinfo si; 1350 struct sysinfo si;
1367 1351
1368 si_meminfo(&si); 1352 si_meminfo(&si);
1369 max_user_instances = 128; 1353 /*
1370 max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / 1354 * Allows top 4% of lomem to be allocated for epoll watches (per user).
1355 */
1356 max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
1371 EP_ITEM_COST; 1357 EP_ITEM_COST;
1372 1358
1373 /* Initialize the structure used to perform safe poll wait head wake ups */ 1359 /* Initialize the structure used to perform safe poll wait head wake ups */
diff --git a/fs/exec.c b/fs/exec.c
index 9c789a525cc4..febfd8ed6ad1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -52,17 +52,13 @@
52#include <linux/audit.h> 52#include <linux/audit.h>
53#include <linux/tracehook.h> 53#include <linux/tracehook.h>
54#include <linux/kmod.h> 54#include <linux/kmod.h>
55#include <linux/fsnotify.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
57#include <asm/mmu_context.h> 58#include <asm/mmu_context.h>
58#include <asm/tlb.h> 59#include <asm/tlb.h>
59#include "internal.h" 60#include "internal.h"
60 61
61#ifdef __alpha__
62/* for /sbin/loader handling in search_binary_handler() */
63#include <linux/a.out.h>
64#endif
65
66int core_uses_pid; 62int core_uses_pid;
67char core_pattern[CORENAME_MAX_SIZE] = "core"; 63char core_pattern[CORENAME_MAX_SIZE] = "core";
68int suid_dumpable = 0; 64int suid_dumpable = 0;
@@ -104,7 +100,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
104 * 100 *
105 * Also note that we take the address to load from from the file itself. 101 * Also note that we take the address to load from from the file itself.
106 */ 102 */
107asmlinkage long sys_uselib(const char __user * library) 103SYSCALL_DEFINE1(uselib, const char __user *, library)
108{ 104{
109 struct file *file; 105 struct file *file;
110 struct nameidata nd; 106 struct nameidata nd;
@@ -128,7 +124,8 @@ asmlinkage long sys_uselib(const char __user * library)
128 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 124 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
129 goto exit; 125 goto exit;
130 126
131 error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); 127 error = inode_permission(nd.path.dentry->d_inode,
128 MAY_READ | MAY_EXEC | MAY_OPEN);
132 if (error) 129 if (error)
133 goto exit; 130 goto exit;
134 error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); 131 error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
@@ -140,6 +137,8 @@ asmlinkage long sys_uselib(const char __user * library)
140 if (IS_ERR(file)) 137 if (IS_ERR(file))
141 goto out; 138 goto out;
142 139
140 fsnotify_open(file->f_path.dentry);
141
143 error = -ENOEXEC; 142 error = -ENOEXEC;
144 if(file->f_op) { 143 if(file->f_op) {
145 struct linux_binfmt * fmt; 144 struct linux_binfmt * fmt;
@@ -237,13 +236,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
237 236
238static int __bprm_mm_init(struct linux_binprm *bprm) 237static int __bprm_mm_init(struct linux_binprm *bprm)
239{ 238{
240 int err = -ENOMEM; 239 int err;
241 struct vm_area_struct *vma = NULL; 240 struct vm_area_struct *vma = NULL;
242 struct mm_struct *mm = bprm->mm; 241 struct mm_struct *mm = bprm->mm;
243 242
244 bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 243 bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
245 if (!vma) 244 if (!vma)
246 goto err; 245 return -ENOMEM;
247 246
248 down_write(&mm->mmap_sem); 247 down_write(&mm->mmap_sem);
249 vma->vm_mm = mm; 248 vma->vm_mm = mm;
@@ -256,28 +255,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
256 */ 255 */
257 vma->vm_end = STACK_TOP_MAX; 256 vma->vm_end = STACK_TOP_MAX;
258 vma->vm_start = vma->vm_end - PAGE_SIZE; 257 vma->vm_start = vma->vm_end - PAGE_SIZE;
259
260 vma->vm_flags = VM_STACK_FLAGS; 258 vma->vm_flags = VM_STACK_FLAGS;
261 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 259 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
262 err = insert_vm_struct(mm, vma); 260 err = insert_vm_struct(mm, vma);
263 if (err) { 261 if (err)
264 up_write(&mm->mmap_sem);
265 goto err; 262 goto err;
266 }
267 263
268 mm->stack_vm = mm->total_vm = 1; 264 mm->stack_vm = mm->total_vm = 1;
269 up_write(&mm->mmap_sem); 265 up_write(&mm->mmap_sem);
270
271 bprm->p = vma->vm_end - sizeof(void *); 266 bprm->p = vma->vm_end - sizeof(void *);
272
273 return 0; 267 return 0;
274
275err: 268err:
276 if (vma) { 269 up_write(&mm->mmap_sem);
277 bprm->vma = NULL; 270 bprm->vma = NULL;
278 kmem_cache_free(vm_area_cachep, vma); 271 kmem_cache_free(vm_area_cachep, vma);
279 }
280
281 return err; 272 return err;
282} 273}
283 274
@@ -684,7 +675,7 @@ struct file *open_exec(const char *name)
684 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 675 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
685 goto out_path_put; 676 goto out_path_put;
686 677
687 err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); 678 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
688 if (err) 679 if (err)
689 goto out_path_put; 680 goto out_path_put;
690 err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); 681 err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
@@ -695,6 +686,8 @@ struct file *open_exec(const char *name)
695 if (IS_ERR(file)) 686 if (IS_ERR(file))
696 return file; 687 return file;
697 688
689 fsnotify_open(file->f_path.dentry);
690
698 err = deny_write_access(file); 691 err = deny_write_access(file);
699 if (err) { 692 if (err) {
700 fput(file); 693 fput(file);
@@ -1178,41 +1171,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1178 unsigned int depth = bprm->recursion_depth; 1171 unsigned int depth = bprm->recursion_depth;
1179 int try,retval; 1172 int try,retval;
1180 struct linux_binfmt *fmt; 1173 struct linux_binfmt *fmt;
1181#ifdef __alpha__
1182 /* handle /sbin/loader.. */
1183 {
1184 struct exec * eh = (struct exec *) bprm->buf;
1185
1186 if (!bprm->loader && eh->fh.f_magic == 0x183 &&
1187 (eh->fh.f_flags & 0x3000) == 0x3000)
1188 {
1189 struct file * file;
1190 unsigned long loader;
1191 1174
1192 allow_write_access(bprm->file);
1193 fput(bprm->file);
1194 bprm->file = NULL;
1195
1196 loader = bprm->vma->vm_end - sizeof(void *);
1197
1198 file = open_exec("/sbin/loader");
1199 retval = PTR_ERR(file);
1200 if (IS_ERR(file))
1201 return retval;
1202
1203 /* Remember if the application is TASO. */
1204 bprm->taso = eh->ah.entry < 0x100000000UL;
1205
1206 bprm->file = file;
1207 bprm->loader = loader;
1208 retval = prepare_binprm(bprm);
1209 if (retval<0)
1210 return retval;
1211 /* should call search_binary_handler recursively here,
1212 but it does not matter */
1213 }
1214 }
1215#endif
1216 retval = security_bprm_check(bprm); 1175 retval = security_bprm_check(bprm);
1217 if (retval) 1176 if (retval)
1218 return retval; 1177 return retval;
@@ -1737,7 +1696,7 @@ int get_dumpable(struct mm_struct *mm)
1737 return (ret >= 2) ? 2 : ret; 1696 return (ret >= 2) ? 2 : ret;
1738} 1697}
1739 1698
1740int do_coredump(long signr, int exit_code, struct pt_regs * regs) 1699void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1741{ 1700{
1742 struct core_state core_state; 1701 struct core_state core_state;
1743 char corename[CORENAME_MAX_SIZE + 1]; 1702 char corename[CORENAME_MAX_SIZE + 1];
@@ -1821,6 +1780,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1821 1780
1822 if (ispipe) { 1781 if (ispipe) {
1823 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1782 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1783 if (!helper_argv) {
1784 printk(KERN_WARNING "%s failed to allocate memory\n",
1785 __func__);
1786 goto fail_unlock;
1787 }
1824 /* Terminate the string before the first option */ 1788 /* Terminate the string before the first option */
1825 delimit = strchr(corename, ' '); 1789 delimit = strchr(corename, ' ');
1826 if (delimit) 1790 if (delimit)
@@ -1888,5 +1852,5 @@ fail_unlock:
1888 put_cred(cred); 1852 put_cred(cred);
1889 coredump_finish(mm); 1853 coredump_finish(mm);
1890fail: 1854fail:
1891 return retval; 1855 return;
1892} 1856}
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 9a0fc400f91c..2999d72153b7 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -95,10 +95,13 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
95 mark_inode_dirty(dir); 95 mark_inode_dirty(dir);
96 } 96 }
97 97
98 if (IS_DIRSYNC(dir)) 98 if (IS_DIRSYNC(dir)) {
99 err = write_one_page(page, 1); 99 err = write_one_page(page, 1);
100 else 100 if (!err)
101 err = ext2_sync_inode(dir);
102 } else {
101 unlock_page(page); 103 unlock_page(page);
104 }
102 105
103 return err; 106 return err;
104} 107}
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 8d0add625870..66321a877e74 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
565 inode->i_blocks = 0; 565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
567 memset(ei->i_data, 0, sizeof(ei->i_data)); 567 memset(ei->i_data, 0, sizeof(ei->i_data));
568 ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; 568 ei->i_flags =
569 if (S_ISLNK(mode)) 569 ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
570 ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
571 /* dirsync is only applied to directories */
572 if (!S_ISDIR(mode))
573 ei->i_flags &= ~EXT2_DIRSYNC_FL;
574 ei->i_faddr = 0; 570 ei->i_faddr = 0;
575 ei->i_frag_no = 0; 571 ei->i_frag_no = 0;
576 ei->i_frag_size = 0; 572 ei->i_frag_size = 0;
@@ -585,7 +581,10 @@ got:
585 spin_lock(&sbi->s_next_gen_lock); 581 spin_lock(&sbi->s_next_gen_lock);
586 inode->i_generation = sbi->s_next_generation++; 582 inode->i_generation = sbi->s_next_generation++;
587 spin_unlock(&sbi->s_next_gen_lock); 583 spin_unlock(&sbi->s_next_gen_lock);
588 insert_inode_hash(inode); 584 if (insert_inode_locked(inode) < 0) {
585 err = -EINVAL;
586 goto fail_drop;
587 }
589 588
590 if (DQUOT_ALLOC_INODE(inode)) { 589 if (DQUOT_ALLOC_INODE(inode)) {
591 err = -EDQUOT; 590 err = -EDQUOT;
@@ -612,6 +611,7 @@ fail_drop:
612 DQUOT_DROP(inode); 611 DQUOT_DROP(inode);
613 inode->i_flags |= S_NOQUOTA; 612 inode->i_flags |= S_NOQUOTA;
614 inode->i_nlink = 0; 613 inode->i_nlink = 0;
614 unlock_new_inode(inode);
615 iput(inode); 615 iput(inode);
616 return ERR_PTR(err); 616 return ERR_PTR(err);
617 617
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7658b33e2653..23fff2f87783 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include <linux/mpage.h> 33#include <linux/mpage.h>
34#include <linux/fiemap.h> 34#include <linux/fiemap.h>
35#include <linux/namei.h>
35#include "ext2.h" 36#include "ext2.h"
36#include "acl.h" 37#include "acl.h"
37#include "xip.h" 38#include "xip.h"
@@ -497,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
497 * ext2_splice_branch - splice the allocated branch onto inode. 498 * ext2_splice_branch - splice the allocated branch onto inode.
498 * @inode: owner 499 * @inode: owner
499 * @block: (logical) number of block we are adding 500 * @block: (logical) number of block we are adding
500 * @chain: chain of indirect blocks (with a missing link - see
501 * ext2_alloc_branch)
502 * @where: location of missing link 501 * @where: location of missing link
503 * @num: number of indirect blocks we are adding 502 * @num: number of indirect blocks we are adding
504 * @blks: number of direct blocks we are adding 503 * @blks: number of direct blocks we are adding
@@ -1286,9 +1285,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1286 else 1285 else
1287 inode->i_mapping->a_ops = &ext2_aops; 1286 inode->i_mapping->a_ops = &ext2_aops;
1288 } else if (S_ISLNK(inode->i_mode)) { 1287 } else if (S_ISLNK(inode->i_mode)) {
1289 if (ext2_inode_is_fast_symlink(inode)) 1288 if (ext2_inode_is_fast_symlink(inode)) {
1290 inode->i_op = &ext2_fast_symlink_inode_operations; 1289 inode->i_op = &ext2_fast_symlink_inode_operations;
1291 else { 1290 nd_terminate_link(ei->i_data, inode->i_size,
1291 sizeof(ei->i_data) - 1);
1292 } else {
1292 inode->i_op = &ext2_symlink_inode_operations; 1293 inode->i_op = &ext2_symlink_inode_operations;
1293 if (test_opt(inode->i_sb, NOBH)) 1294 if (test_opt(inode->i_sb, NOBH))
1294 inode->i_mapping->a_ops = &ext2_nobh_aops; 1295 inode->i_mapping->a_ops = &ext2_nobh_aops;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e1..7cb4badef927 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
50 goto setflags_out; 50 goto setflags_out;
51 } 51 }
52 52
53 if (!S_ISDIR(inode->i_mode)) 53 flags = ext2_mask_flags(inode->i_mode, flags);
54 flags &= ~EXT2_DIRSYNC_FL;
55 54
56 mutex_lock(&inode->i_mutex); 55 mutex_lock(&inode->i_mutex);
57 /* Is it quota file? Do not allow user to mess with it */ 56 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2a747252ec12..90ea17998a73 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
41 int err = ext2_add_link(dentry, inode); 41 int err = ext2_add_link(dentry, inode);
42 if (!err) { 42 if (!err) {
43 d_instantiate(dentry, inode); 43 d_instantiate(dentry, inode);
44 unlock_new_inode(inode);
44 return 0; 45 return 0;
45 } 46 }
46 inode_dec_link_count(inode); 47 inode_dec_link_count(inode);
48 unlock_new_inode(inode);
47 iput(inode); 49 iput(inode);
48 return err; 50 return err;
49} 51}
@@ -170,6 +172,7 @@ out:
170 172
171out_fail: 173out_fail:
172 inode_dec_link_count(inode); 174 inode_dec_link_count(inode);
175 unlock_new_inode(inode);
173 iput (inode); 176 iput (inode);
174 goto out; 177 goto out;
175} 178}
@@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
178 struct dentry *dentry) 181 struct dentry *dentry)
179{ 182{
180 struct inode *inode = old_dentry->d_inode; 183 struct inode *inode = old_dentry->d_inode;
184 int err;
181 185
182 if (inode->i_nlink >= EXT2_LINK_MAX) 186 if (inode->i_nlink >= EXT2_LINK_MAX)
183 return -EMLINK; 187 return -EMLINK;
@@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
186 inode_inc_link_count(inode); 190 inode_inc_link_count(inode);
187 atomic_inc(&inode->i_count); 191 atomic_inc(&inode->i_count);
188 192
189 return ext2_add_nondir(dentry, inode); 193 err = ext2_add_link(dentry, inode);
194 if (!err) {
195 d_instantiate(dentry, inode);
196 return 0;
197 }
198 inode_dec_link_count(inode);
199 iput(inode);
200 return err;
190} 201}
191 202
192static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) 203static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
@@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
222 goto out_fail; 233 goto out_fail;
223 234
224 d_instantiate(dentry, inode); 235 d_instantiate(dentry, inode);
236 unlock_new_inode(inode);
225out: 237out:
226 return err; 238 return err;
227 239
228out_fail: 240out_fail:
229 inode_dec_link_count(inode); 241 inode_dec_link_count(inode);
230 inode_dec_link_count(inode); 242 inode_dec_link_count(inode);
243 unlock_new_inode(inode);
231 iput(inode); 244 iput(inode);
232out_dir: 245out_dir:
233 inode_dec_link_count(dir); 246 inode_dec_link_count(dir);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac87..da8bdeaa2e6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
132 percpu_counter_destroy(&sbi->s_dirs_counter); 132 percpu_counter_destroy(&sbi->s_dirs_counter);
133 brelse (sbi->s_sbh); 133 brelse (sbi->s_sbh);
134 sb->s_fs_info = NULL; 134 sb->s_fs_info = NULL;
135 kfree(sbi->s_blockgroup_lock);
135 kfree(sbi); 136 kfree(sbi);
136 137
137 return; 138 return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
756 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 757 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
757 if (!sbi) 758 if (!sbi)
758 return -ENOMEM; 759 return -ENOMEM;
760
761 sbi->s_blockgroup_lock =
762 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
763 if (!sbi->s_blockgroup_lock) {
764 kfree(sbi);
765 return -ENOMEM;
766 }
759 sb->s_fs_info = sbi; 767 sb->s_fs_info = sbi;
760 sbi->s_sb_block = sb_block; 768 sbi->s_sb_block = sb_block;
761 769
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
983 printk ("EXT2-fs: not enough memory\n"); 991 printk ("EXT2-fs: not enough memory\n");
984 goto failed_mount; 992 goto failed_mount;
985 } 993 }
986 bgl_lock_init(&sbi->s_blockgroup_lock); 994 bgl_lock_init(sbi->s_blockgroup_lock);
987 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); 995 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
988 if (!sbi->s_debts) { 996 if (!sbi->s_debts) {
989 printk ("EXT2-fs: not enough memory\n"); 997 printk ("EXT2-fs: not enough memory\n");
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index c30e149fbd2e..7d215b4d4f2e 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
49} 68}
50 69
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
52{ 71{
53 __u32 pad, val; 72 __u32 pad, val;
54 int i; 73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
96}
97
98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
99{
100 __u32 pad, val;
101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i=0; i < len; i++) { 110 for (i=0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 490bd0ed7896..8de6c720e510 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
559 ei->i_dir_start_lookup = 0; 559 ei->i_dir_start_lookup = 0;
560 ei->i_disksize = 0; 560 ei->i_disksize = 0;
561 561
562 ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; 562 ei->i_flags =
563 if (S_ISLNK(mode)) 563 ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
564 ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
565 /* dirsync only applies to directories */
566 if (!S_ISDIR(mode))
567 ei->i_flags &= ~EXT3_DIRSYNC_FL;
568#ifdef EXT3_FRAGMENTS 564#ifdef EXT3_FRAGMENTS
569 ei->i_faddr = 0; 565 ei->i_faddr = 0;
570 ei->i_frag_no = 0; 566 ei->i_frag_no = 0;
@@ -579,7 +575,10 @@ got:
579 ext3_set_inode_flags(inode); 575 ext3_set_inode_flags(inode);
580 if (IS_DIRSYNC(inode)) 576 if (IS_DIRSYNC(inode))
581 handle->h_sync = 1; 577 handle->h_sync = 1;
582 insert_inode_hash(inode); 578 if (insert_inode_locked(inode) < 0) {
579 err = -EINVAL;
580 goto fail_drop;
581 }
583 spin_lock(&sbi->s_next_gen_lock); 582 spin_lock(&sbi->s_next_gen_lock);
584 inode->i_generation = sbi->s_next_generation++; 583 inode->i_generation = sbi->s_next_generation++;
585 spin_unlock(&sbi->s_next_gen_lock); 584 spin_unlock(&sbi->s_next_gen_lock);
@@ -627,6 +626,7 @@ fail_drop:
627 DQUOT_DROP(inode); 626 DQUOT_DROP(inode);
628 inode->i_flags |= S_NOQUOTA; 627 inode->i_flags |= S_NOQUOTA;
629 inode->i_nlink = 0; 628 inode->i_nlink = 0;
629 unlock_new_inode(inode);
630 iput(inode); 630 iput(inode);
631 brelse(bitmap_bh); 631 brelse(bitmap_bh);
632 return ERR_PTR(err); 632 return ERR_PTR(err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f8424ad89971..5fa453b49a64 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -37,6 +37,7 @@
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/fiemap.h> 39#include <linux/fiemap.h>
40#include <linux/namei.h>
40#include "xattr.h" 41#include "xattr.h"
41#include "acl.h" 42#include "acl.h"
42 43
@@ -1160,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
1160 to = from + len; 1161 to = from + len;
1161 1162
1162retry: 1163retry:
1163 page = __grab_cache_page(mapping, index); 1164 page = grab_cache_page_write_begin(mapping, index, flags);
1164 if (!page) 1165 if (!page)
1165 return -ENOMEM; 1166 return -ENOMEM;
1166 *pagep = page; 1167 *pagep = page;
@@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2817 inode->i_op = &ext3_dir_inode_operations; 2818 inode->i_op = &ext3_dir_inode_operations;
2818 inode->i_fop = &ext3_dir_operations; 2819 inode->i_fop = &ext3_dir_operations;
2819 } else if (S_ISLNK(inode->i_mode)) { 2820 } else if (S_ISLNK(inode->i_mode)) {
2820 if (ext3_inode_is_fast_symlink(inode)) 2821 if (ext3_inode_is_fast_symlink(inode)) {
2821 inode->i_op = &ext3_fast_symlink_inode_operations; 2822 inode->i_op = &ext3_fast_symlink_inode_operations;
2822 else { 2823 nd_terminate_link(ei->i_data, inode->i_size,
2824 sizeof(ei->i_data) - 1);
2825 } else {
2823 inode->i_op = &ext3_symlink_inode_operations; 2826 inode->i_op = &ext3_symlink_inode_operations;
2824 ext3_set_aops(inode); 2827 ext3_set_aops(inode);
2825 } 2828 }
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8e..5e86ce9a86e0 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
53 goto flags_out; 53 goto flags_out;
54 } 54 }
55 55
56 if (!S_ISDIR(inode->i_mode)) 56 flags = ext3_mask_flags(inode->i_mode, flags);
57 flags &= ~EXT3_DIRSYNC_FL;
58 57
59 mutex_lock(&inode->i_mutex); 58 mutex_lock(&inode->i_mutex);
60 /* Is it quota file? Do not allow user to mess with it */ 59 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3e5edc92aa0b..4db4ffa1edad 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
@@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
368 goto fail; 364 goto fail;
369 } 365 }
370 hinfo->hash_version = root->info.hash_version; 366 hinfo->hash_version = root->info.hash_version;
367 if (hinfo->hash_version <= DX_HASH_TEA)
368 hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
371 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; 369 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
372 if (entry) 370 if (entry)
373 ext3fs_dirhash(entry->name, entry->len, hinfo); 371 ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
636 dir = dir_file->f_path.dentry->d_inode; 634 dir = dir_file->f_path.dentry->d_inode;
637 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { 635 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
638 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; 636 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
637 if (hinfo.hash_version <= DX_HASH_TEA)
638 hinfo.hash_version +=
639 EXT3_SB(dir->i_sb)->s_hash_unsigned;
639 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 640 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
640 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 641 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
641 start_hash, start_minor_hash); 642 start_hash, start_minor_hash);
@@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1156 u32 hash2; 1157 u32 hash2;
1157 struct dx_map_entry *map; 1158 struct dx_map_entry *map;
1158 char *data1 = (*bh)->b_data, *data2; 1159 char *data1 = (*bh)->b_data, *data2;
1159 unsigned split, move, size, i; 1160 unsigned split, move, size;
1160 struct ext3_dir_entry_2 *de = NULL, *de2; 1161 struct ext3_dir_entry_2 *de = NULL, *de2;
1161 int err = 0; 1162 int err = 0, i;
1162 1163
1163 bh2 = ext3_append (handle, dir, &newblock, &err); 1164 bh2 = ext3_append (handle, dir, &newblock, &err);
1164 if (!(bh2)) { 1165 if (!(bh2)) {
@@ -1357,7 +1358,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1357 struct fake_dirent *fde; 1358 struct fake_dirent *fde;
1358 1359
1359 blocksize = dir->i_sb->s_blocksize; 1360 blocksize = dir->i_sb->s_blocksize;
1360 dxtrace(printk("Creating index\n")); 1361 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1361 retval = ext3_journal_get_write_access(handle, bh); 1362 retval = ext3_journal_get_write_access(handle, bh);
1362 if (retval) { 1363 if (retval) {
1363 ext3_std_error(dir->i_sb, retval); 1364 ext3_std_error(dir->i_sb, retval);
@@ -1366,6 +1367,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1366 } 1367 }
1367 root = (struct dx_root *) bh->b_data; 1368 root = (struct dx_root *) bh->b_data;
1368 1369
1370 /* The 0th block becomes the root, move the dirents out */
1371 fde = &root->dotdot;
1372 de = (struct ext3_dir_entry_2 *)((char *)fde +
1373 ext3_rec_len_from_disk(fde->rec_len));
1374 if ((char *) de >= (((char *) root) + blocksize)) {
1375 ext3_error(dir->i_sb, __func__,
1376 "invalid rec_len for '..' in inode %lu",
1377 dir->i_ino);
1378 brelse(bh);
1379 return -EIO;
1380 }
1381 len = ((char *) root) + blocksize - (char *) de;
1382
1369 bh2 = ext3_append (handle, dir, &block, &retval); 1383 bh2 = ext3_append (handle, dir, &block, &retval);
1370 if (!(bh2)) { 1384 if (!(bh2)) {
1371 brelse(bh); 1385 brelse(bh);
@@ -1374,11 +1388,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1374 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; 1388 EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
1375 data1 = bh2->b_data; 1389 data1 = bh2->b_data;
1376 1390
1377 /* The 0th block becomes the root, move the dirents out */
1378 fde = &root->dotdot;
1379 de = (struct ext3_dir_entry_2 *)((char *)fde +
1380 ext3_rec_len_from_disk(fde->rec_len));
1381 len = ((char *) root) + blocksize - (char *) de;
1382 memcpy (data1, de, len); 1391 memcpy (data1, de, len);
1383 de = (struct ext3_dir_entry_2 *) data1; 1392 de = (struct ext3_dir_entry_2 *) data1;
1384 top = data1 + len; 1393 top = data1 + len;
@@ -1398,6 +1407,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1398 1407
1399 /* Initialize as for dx_probe */ 1408 /* Initialize as for dx_probe */
1400 hinfo.hash_version = root->info.hash_version; 1409 hinfo.hash_version = root->info.hash_version;
1410 if (hinfo.hash_version <= DX_HASH_TEA)
1411 hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
1401 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 1412 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1402 ext3fs_dirhash(name, namelen, &hinfo); 1413 ext3fs_dirhash(name, namelen, &hinfo);
1403 frame = frames; 1414 frame = frames;
@@ -1652,9 +1663,11 @@ static int ext3_add_nondir(handle_t *handle,
1652 if (!err) { 1663 if (!err) {
1653 ext3_mark_inode_dirty(handle, inode); 1664 ext3_mark_inode_dirty(handle, inode);
1654 d_instantiate(dentry, inode); 1665 d_instantiate(dentry, inode);
1666 unlock_new_inode(inode);
1655 return 0; 1667 return 0;
1656 } 1668 }
1657 drop_nlink(inode); 1669 drop_nlink(inode);
1670 unlock_new_inode(inode);
1658 iput(inode); 1671 iput(inode);
1659 return err; 1672 return err;
1660} 1673}
@@ -1765,6 +1778,7 @@ retry:
1765 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1778 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1766 if (!dir_block) { 1779 if (!dir_block) {
1767 drop_nlink(inode); /* is this nlink == 0? */ 1780 drop_nlink(inode); /* is this nlink == 0? */
1781 unlock_new_inode(inode);
1768 ext3_mark_inode_dirty(handle, inode); 1782 ext3_mark_inode_dirty(handle, inode);
1769 iput (inode); 1783 iput (inode);
1770 goto out_stop; 1784 goto out_stop;
@@ -1792,6 +1806,7 @@ retry:
1792 err = ext3_add_entry (handle, dentry, inode); 1806 err = ext3_add_entry (handle, dentry, inode);
1793 if (err) { 1807 if (err) {
1794 inode->i_nlink = 0; 1808 inode->i_nlink = 0;
1809 unlock_new_inode(inode);
1795 ext3_mark_inode_dirty(handle, inode); 1810 ext3_mark_inode_dirty(handle, inode);
1796 iput (inode); 1811 iput (inode);
1797 goto out_stop; 1812 goto out_stop;
@@ -1800,6 +1815,7 @@ retry:
1800 ext3_update_dx_flag(dir); 1815 ext3_update_dx_flag(dir);
1801 ext3_mark_inode_dirty(handle, dir); 1816 ext3_mark_inode_dirty(handle, dir);
1802 d_instantiate(dentry, inode); 1817 d_instantiate(dentry, inode);
1818 unlock_new_inode(inode);
1803out_stop: 1819out_stop:
1804 ext3_journal_stop(handle); 1820 ext3_journal_stop(handle);
1805 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1821 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
@@ -2170,10 +2186,10 @@ retry:
2170 * We have a transaction open. All is sweetness. It also sets 2186 * We have a transaction open. All is sweetness. It also sets
2171 * i_size in generic_commit_write(). 2187 * i_size in generic_commit_write().
2172 */ 2188 */
2173 err = __page_symlink(inode, symname, l, 2189 err = __page_symlink(inode, symname, l, 1);
2174 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2175 if (err) { 2190 if (err) {
2176 drop_nlink(inode); 2191 drop_nlink(inode);
2192 unlock_new_inode(inode);
2177 ext3_mark_inode_dirty(handle, inode); 2193 ext3_mark_inode_dirty(handle, inode);
2178 iput (inode); 2194 iput (inode);
2179 goto out_stop; 2195 goto out_stop;
@@ -2221,7 +2237,14 @@ retry:
2221 inc_nlink(inode); 2237 inc_nlink(inode);
2222 atomic_inc(&inode->i_count); 2238 atomic_inc(&inode->i_count);
2223 2239
2224 err = ext3_add_nondir(handle, dentry, inode); 2240 err = ext3_add_entry(handle, dentry, inode);
2241 if (!err) {
2242 ext3_mark_inode_dirty(handle, inode);
2243 d_instantiate(dentry, inode);
2244 } else {
2245 drop_nlink(inode);
2246 iput(inode);
2247 }
2225 ext3_journal_stop(handle); 2248 ext3_journal_stop(handle);
2226 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 2249 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2227 goto retry; 2250 goto retry;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec1..b70d90e08a3c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
48 unsigned long journal_devnum); 48 unsigned long journal_devnum);
49static int ext3_create_journal(struct super_block *, struct ext3_super_block *, 49static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
50 unsigned int); 50 unsigned int);
51static void ext3_commit_super (struct super_block * sb, 51static int ext3_commit_super(struct super_block *sb,
52 struct ext3_super_block * es, 52 struct ext3_super_block *es,
53 int sync); 53 int sync);
54static void ext3_mark_recovery_complete(struct super_block * sb, 54static void ext3_mark_recovery_complete(struct super_block * sb,
55 struct ext3_super_block * es); 55 struct ext3_super_block * es);
@@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
60 char nbuf[16]); 60 char nbuf[16]);
61static int ext3_remount (struct super_block * sb, int * flags, char * data); 61static int ext3_remount (struct super_block * sb, int * flags, char * data);
62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); 62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
63static void ext3_unlockfs(struct super_block *sb); 63static int ext3_unfreeze(struct super_block *sb);
64static void ext3_write_super (struct super_block * sb); 64static void ext3_write_super (struct super_block * sb);
65static void ext3_write_super_lockfs(struct super_block *sb); 65static int ext3_freeze(struct super_block *sb);
66 66
67/* 67/*
68 * Wrappers for journal_start/end. 68 * Wrappers for journal_start/end.
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
439 ext3_blkdev_remove(sbi); 439 ext3_blkdev_remove(sbi);
440 } 440 }
441 sb->s_fs_info = NULL; 441 sb->s_fs_info = NULL;
442 kfree(sbi->s_blockgroup_lock);
442 kfree(sbi); 443 kfree(sbi);
443 return; 444 return;
444} 445}
@@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
682 ext3_nfs_get_inode); 683 ext3_nfs_get_inode);
683} 684}
684 685
686/*
687 * Try to release metadata pages (indirect blocks, directories) which are
688 * mapped via the block device. Since these pages could have journal heads
689 * which would prevent try_to_free_buffers() from freeing them, we must use
690 * jbd layer's try_to_free_buffers() function to release them.
691 */
692static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
693 gfp_t wait)
694{
695 journal_t *journal = EXT3_SB(sb)->s_journal;
696
697 WARN_ON(PageChecked(page));
698 if (!page_has_buffers(page))
699 return 0;
700 if (journal)
701 return journal_try_to_free_buffers(journal, page,
702 wait & ~__GFP_WAIT);
703 return try_to_free_buffers(page);
704}
705
685#ifdef CONFIG_QUOTA 706#ifdef CONFIG_QUOTA
686#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") 707#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
687#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 708#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -713,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = {
713 .acquire_dquot = ext3_acquire_dquot, 734 .acquire_dquot = ext3_acquire_dquot,
714 .release_dquot = ext3_release_dquot, 735 .release_dquot = ext3_release_dquot,
715 .mark_dirty = ext3_mark_dquot_dirty, 736 .mark_dirty = ext3_mark_dquot_dirty,
716 .write_info = ext3_write_info 737 .write_info = ext3_write_info,
738 .alloc_dquot = dquot_alloc,
739 .destroy_dquot = dquot_destroy,
717}; 740};
718 741
719static struct quotactl_ops ext3_qctl_operations = { 742static struct quotactl_ops ext3_qctl_operations = {
@@ -736,8 +759,8 @@ static const struct super_operations ext3_sops = {
736 .put_super = ext3_put_super, 759 .put_super = ext3_put_super,
737 .write_super = ext3_write_super, 760 .write_super = ext3_write_super,
738 .sync_fs = ext3_sync_fs, 761 .sync_fs = ext3_sync_fs,
739 .write_super_lockfs = ext3_write_super_lockfs, 762 .freeze_fs = ext3_freeze,
740 .unlockfs = ext3_unlockfs, 763 .unfreeze_fs = ext3_unfreeze,
741 .statfs = ext3_statfs, 764 .statfs = ext3_statfs,
742 .remount_fs = ext3_remount, 765 .remount_fs = ext3_remount,
743 .clear_inode = ext3_clear_inode, 766 .clear_inode = ext3_clear_inode,
@@ -746,6 +769,7 @@ static const struct super_operations ext3_sops = {
746 .quota_read = ext3_quota_read, 769 .quota_read = ext3_quota_read,
747 .quota_write = ext3_quota_write, 770 .quota_write = ext3_quota_write,
748#endif 771#endif
772 .bdev_try_to_free_page = bdev_try_to_free_page,
749}; 773};
750 774
751static const struct export_operations ext3_export_ops = { 775static const struct export_operations ext3_export_ops = {
@@ -1035,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb,
1035 case Opt_grpjquota: 1059 case Opt_grpjquota:
1036 qtype = GRPQUOTA; 1060 qtype = GRPQUOTA;
1037set_qf_name: 1061set_qf_name:
1038 if ((sb_any_quota_enabled(sb) || 1062 if (sb_any_quota_loaded(sb) &&
1039 sb_any_quota_suspended(sb)) &&
1040 !sbi->s_qf_names[qtype]) { 1063 !sbi->s_qf_names[qtype]) {
1041 printk(KERN_ERR 1064 printk(KERN_ERR
1042 "EXT3-fs: Cannot change journaled " 1065 "EXT3-fs: Cannot change journaled "
@@ -1075,8 +1098,7 @@ set_qf_name:
1075 case Opt_offgrpjquota: 1098 case Opt_offgrpjquota:
1076 qtype = GRPQUOTA; 1099 qtype = GRPQUOTA;
1077clear_qf_name: 1100clear_qf_name:
1078 if ((sb_any_quota_enabled(sb) || 1101 if (sb_any_quota_loaded(sb) &&
1079 sb_any_quota_suspended(sb)) &&
1080 sbi->s_qf_names[qtype]) { 1102 sbi->s_qf_names[qtype]) {
1081 printk(KERN_ERR "EXT3-fs: Cannot change " 1103 printk(KERN_ERR "EXT3-fs: Cannot change "
1082 "journaled quota options when " 1104 "journaled quota options when "
@@ -1095,8 +1117,7 @@ clear_qf_name:
1095 case Opt_jqfmt_vfsv0: 1117 case Opt_jqfmt_vfsv0:
1096 qfmt = QFMT_VFS_V0; 1118 qfmt = QFMT_VFS_V0;
1097set_qf_format: 1119set_qf_format:
1098 if ((sb_any_quota_enabled(sb) || 1120 if (sb_any_quota_loaded(sb) &&
1099 sb_any_quota_suspended(sb)) &&
1100 sbi->s_jquota_fmt != qfmt) { 1121 sbi->s_jquota_fmt != qfmt) {
1101 printk(KERN_ERR "EXT3-fs: Cannot change " 1122 printk(KERN_ERR "EXT3-fs: Cannot change "
1102 "journaled quota options when " 1123 "journaled quota options when "
@@ -1115,8 +1136,7 @@ set_qf_format:
1115 set_opt(sbi->s_mount_opt, GRPQUOTA); 1136 set_opt(sbi->s_mount_opt, GRPQUOTA);
1116 break; 1137 break;
1117 case Opt_noquota: 1138 case Opt_noquota:
1118 if (sb_any_quota_enabled(sb) || 1139 if (sb_any_quota_loaded(sb)) {
1119 sb_any_quota_suspended(sb)) {
1120 printk(KERN_ERR "EXT3-fs: Cannot change quota " 1140 printk(KERN_ERR "EXT3-fs: Cannot change quota "
1121 "options when quota turned on.\n"); 1141 "options when quota turned on.\n");
1122 return 0; 1142 return 0;
@@ -1548,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1548 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 1568 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1549 if (!sbi) 1569 if (!sbi)
1550 return -ENOMEM; 1570 return -ENOMEM;
1571
1572 sbi->s_blockgroup_lock =
1573 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
1574 if (!sbi->s_blockgroup_lock) {
1575 kfree(sbi);
1576 return -ENOMEM;
1577 }
1551 sb->s_fs_info = sbi; 1578 sb->s_fs_info = sbi;
1552 sbi->s_mount_opt = 0; 1579 sbi->s_mount_opt = 0;
1553 sbi->s_resuid = EXT3_DEF_RESUID; 1580 sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1744,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1744 for (i=0; i < 4; i++) 1771 for (i=0; i < 4; i++)
1745 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 1772 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1746 sbi->s_def_hash_version = es->s_def_hash_version; 1773 sbi->s_def_hash_version = es->s_def_hash_version;
1774 i = le32_to_cpu(es->s_flags);
1775 if (i & EXT2_FLAGS_UNSIGNED_HASH)
1776 sbi->s_hash_unsigned = 3;
1777 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
1778#ifdef __CHAR_UNSIGNED__
1779 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
1780 sbi->s_hash_unsigned = 3;
1781#else
1782 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
1783#endif
1784 sb->s_dirt = 1;
1785 }
1747 1786
1748 if (sbi->s_blocks_per_group > blocksize * 8) { 1787 if (sbi->s_blocks_per_group > blocksize * 8) {
1749 printk (KERN_ERR 1788 printk (KERN_ERR
@@ -1788,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1788 goto failed_mount; 1827 goto failed_mount;
1789 } 1828 }
1790 1829
1791 bgl_lock_init(&sbi->s_blockgroup_lock); 1830 bgl_lock_init(sbi->s_blockgroup_lock);
1792 1831
1793 for (i = 0; i < db_count; i++) { 1832 for (i = 0; i < db_count; i++) {
1794 block = descriptor_loc(sb, logic_sb_block, i); 1833 block = descriptor_loc(sb, logic_sb_block, i);
@@ -2272,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb,
2272 return 0; 2311 return 0;
2273} 2312}
2274 2313
2275static void ext3_commit_super (struct super_block * sb, 2314static int ext3_commit_super(struct super_block *sb,
2276 struct ext3_super_block * es, 2315 struct ext3_super_block *es,
2277 int sync) 2316 int sync)
2278{ 2317{
2279 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; 2318 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
2319 int error = 0;
2280 2320
2281 if (!sbh) 2321 if (!sbh)
2282 return; 2322 return error;
2283 es->s_wtime = cpu_to_le32(get_seconds()); 2323 es->s_wtime = cpu_to_le32(get_seconds());
2284 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); 2324 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
2285 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); 2325 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2286 BUFFER_TRACE(sbh, "marking dirty"); 2326 BUFFER_TRACE(sbh, "marking dirty");
2287 mark_buffer_dirty(sbh); 2327 mark_buffer_dirty(sbh);
2288 if (sync) 2328 if (sync)
2289 sync_dirty_buffer(sbh); 2329 error = sync_dirty_buffer(sbh);
2330 return error;
2290} 2331}
2291 2332
2292 2333
@@ -2400,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
2400 * LVM calls this function before a (read-only) snapshot is created. This 2441 * LVM calls this function before a (read-only) snapshot is created. This
2401 * gives us a chance to flush the journal completely and mark the fs clean. 2442 * gives us a chance to flush the journal completely and mark the fs clean.
2402 */ 2443 */
2403static void ext3_write_super_lockfs(struct super_block *sb) 2444static int ext3_freeze(struct super_block *sb)
2404{ 2445{
2446 int error = 0;
2447 journal_t *journal;
2405 sb->s_dirt = 0; 2448 sb->s_dirt = 0;
2406 2449
2407 if (!(sb->s_flags & MS_RDONLY)) { 2450 if (!(sb->s_flags & MS_RDONLY)) {
2408 journal_t *journal = EXT3_SB(sb)->s_journal; 2451 journal = EXT3_SB(sb)->s_journal;
2409 2452
2410 /* Now we set up the journal barrier. */ 2453 /* Now we set up the journal barrier. */
2411 journal_lock_updates(journal); 2454 journal_lock_updates(journal);
@@ -2414,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb)
2414 * We don't want to clear needs_recovery flag when we failed 2457 * We don't want to clear needs_recovery flag when we failed
2415 * to flush the journal. 2458 * to flush the journal.
2416 */ 2459 */
2417 if (journal_flush(journal) < 0) 2460 error = journal_flush(journal);
2418 return; 2461 if (error < 0)
2462 goto out;
2419 2463
2420 /* Journal blocked and flushed, clear needs_recovery flag. */ 2464 /* Journal blocked and flushed, clear needs_recovery flag. */
2421 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2465 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2422 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); 2466 error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2467 if (error)
2468 goto out;
2423 } 2469 }
2470 return 0;
2471
2472out:
2473 journal_unlock_updates(journal);
2474 return error;
2424} 2475}
2425 2476
2426/* 2477/*
2427 * Called by LVM after the snapshot is done. We need to reset the RECOVER 2478 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2428 * flag here, even though the filesystem is not technically dirty yet. 2479 * flag here, even though the filesystem is not technically dirty yet.
2429 */ 2480 */
2430static void ext3_unlockfs(struct super_block *sb) 2481static int ext3_unfreeze(struct super_block *sb)
2431{ 2482{
2432 if (!(sb->s_flags & MS_RDONLY)) { 2483 if (!(sb->s_flags & MS_RDONLY)) {
2433 lock_super(sb); 2484 lock_super(sb);
@@ -2437,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb)
2437 unlock_super(sb); 2488 unlock_super(sb);
2438 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2489 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2439 } 2490 }
2491 return 0;
2440} 2492}
2441 2493
2442static int ext3_remount (struct super_block * sb, int * flags, char * data) 2494static int ext3_remount (struct super_block * sb, int * flags, char * data)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38b3acf5683b..9a50b8052dcf 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
20#include "ext4.h" 20#include "ext4.h"
21#include "ext4_jbd2.h" 21#include "ext4_jbd2.h"
22#include "group.h" 22#include "group.h"
23#include "mballoc.h"
23 24
24/* 25/*
25 * balloc.c contains the blocks allocation and deallocation routines 26 * balloc.c contains the blocks allocation and deallocation routines
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
100 * essentially implementing a per-group read-only flag. */ 101 * essentially implementing a per-group read-only flag. */
101 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 102 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
102 ext4_error(sb, __func__, 103 ext4_error(sb, __func__,
103 "Checksum bad for group %lu\n", block_group); 104 "Checksum bad for group %u", block_group);
104 gdp->bg_free_blocks_count = 0; 105 ext4_free_blks_set(sb, gdp, 0);
105 gdp->bg_free_inodes_count = 0; 106 ext4_free_inodes_set(sb, gdp, 0);
106 gdp->bg_itable_unused = 0; 107 ext4_itable_unused_set(sb, gdp, 0);
107 memset(bh->b_data, 0xff, sb->s_blocksize); 108 memset(bh->b_data, 0xff, sb->s_blocksize);
108 return 0; 109 return 0;
109 } 110 }
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
205 ext4_group_t block_group, 206 ext4_group_t block_group,
206 struct buffer_head **bh) 207 struct buffer_head **bh)
207{ 208{
208 unsigned long group_desc; 209 unsigned int group_desc;
209 unsigned long offset; 210 unsigned int offset;
210 struct ext4_group_desc *desc; 211 struct ext4_group_desc *desc;
211 struct ext4_sb_info *sbi = EXT4_SB(sb); 212 struct ext4_sb_info *sbi = EXT4_SB(sb);
212 213
213 if (block_group >= sbi->s_groups_count) { 214 if (block_group >= sbi->s_groups_count) {
214 ext4_error(sb, "ext4_get_group_desc", 215 ext4_error(sb, "ext4_get_group_desc",
215 "block_group >= groups_count - " 216 "block_group >= groups_count - "
216 "block_group = %lu, groups_count = %lu", 217 "block_group = %u, groups_count = %u",
217 block_group, sbi->s_groups_count); 218 block_group, sbi->s_groups_count);
218 219
219 return NULL; 220 return NULL;
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
225 if (!sbi->s_group_desc[group_desc]) { 226 if (!sbi->s_group_desc[group_desc]) {
226 ext4_error(sb, "ext4_get_group_desc", 227 ext4_error(sb, "ext4_get_group_desc",
227 "Group descriptor not loaded - " 228 "Group descriptor not loaded - "
228 "block_group = %lu, group_desc = %lu, desc = %lu", 229 "block_group = %u, group_desc = %u, desc = %u",
229 block_group, group_desc, offset); 230 block_group, group_desc, offset);
230 return NULL; 231 return NULL;
231 } 232 }
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
315 if (unlikely(!bh)) { 316 if (unlikely(!bh)) {
316 ext4_error(sb, __func__, 317 ext4_error(sb, __func__,
317 "Cannot read block bitmap - " 318 "Cannot read block bitmap - "
318 "block_group = %lu, block_bitmap = %llu", 319 "block_group = %u, block_bitmap = %llu",
319 block_group, bitmap_blk); 320 block_group, bitmap_blk);
320 return NULL; 321 return NULL;
321 } 322 }
322 if (buffer_uptodate(bh) && 323
323 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) 324 if (bitmap_uptodate(bh))
324 return bh; 325 return bh;
325 326
326 lock_buffer(bh); 327 lock_buffer(bh);
328 if (bitmap_uptodate(bh)) {
329 unlock_buffer(bh);
330 return bh;
331 }
327 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 332 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
328 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 333 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
329 ext4_init_block_bitmap(sb, bh, block_group, desc); 334 ext4_init_block_bitmap(sb, bh, block_group, desc);
335 set_bitmap_uptodate(bh);
330 set_buffer_uptodate(bh); 336 set_buffer_uptodate(bh);
331 unlock_buffer(bh);
332 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 337 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
338 unlock_buffer(bh);
333 return bh; 339 return bh;
334 } 340 }
335 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 341 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
342 if (buffer_uptodate(bh)) {
343 /*
344 * if not uninit if bh is uptodate,
345 * bitmap is also uptodate
346 */
347 set_bitmap_uptodate(bh);
348 unlock_buffer(bh);
349 return bh;
350 }
351 /*
352 * submit the buffer_head for read. We can
353 * safely mark the bitmap as uptodate now.
354 * We do it here so the bitmap uptodate bit
355 * get set with buffer lock held.
356 */
357 set_bitmap_uptodate(bh);
336 if (bh_submit_read(bh) < 0) { 358 if (bh_submit_read(bh) < 0) {
337 put_bh(bh); 359 put_bh(bh);
338 ext4_error(sb, __func__, 360 ext4_error(sb, __func__,
339 "Cannot read block bitmap - " 361 "Cannot read block bitmap - "
340 "block_group = %lu, block_bitmap = %llu", 362 "block_group = %u, block_bitmap = %llu",
341 block_group, bitmap_blk); 363 block_group, bitmap_blk);
342 return NULL; 364 return NULL;
343 } 365 }
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
350} 372}
351 373
352/** 374/**
353 * ext4_free_blocks_sb() -- Free given blocks and update quota 375 * ext4_add_groupblocks() -- Add given blocks to an existing group
354 * @handle: handle to this transaction 376 * @handle: handle to this transaction
355 * @sb: super block 377 * @sb: super block
356 * @block: start physcial block to free 378 * @block: start physcial block to add to the block group
357 * @count: number of blocks to free 379 * @count: number of blocks to free
358 * @pdquot_freed_blocks: pointer to quota
359 * 380 *
360 * XXX This function is only used by the on-line resizing code, which 381 * This marks the blocks as free in the bitmap. We ask the
361 * should probably be fixed up to call the mballoc variant. There 382 * mballoc to reload the buddy after this by setting group
362 * this needs to be cleaned up later; in fact, I'm not convinced this 383 * EXT4_GROUP_INFO_NEED_INIT_BIT flag
363 * is 100% correct in the face of the mballoc code. The online resizing
364 * code needs to be fixed up to more tightly (and correctly) interlock
365 * with the mballoc code.
366 */ 384 */
367void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 385void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
368 ext4_fsblk_t block, unsigned long count, 386 ext4_fsblk_t block, unsigned long count)
369 unsigned long *pdquot_freed_blocks)
370{ 387{
371 struct buffer_head *bitmap_bh = NULL; 388 struct buffer_head *bitmap_bh = NULL;
372 struct buffer_head *gd_bh; 389 struct buffer_head *gd_bh;
373 ext4_group_t block_group; 390 ext4_group_t block_group;
374 ext4_grpblk_t bit; 391 ext4_grpblk_t bit;
375 unsigned long i; 392 unsigned int i;
376 unsigned long overflow;
377 struct ext4_group_desc *desc; 393 struct ext4_group_desc *desc;
378 struct ext4_super_block *es; 394 struct ext4_super_block *es;
379 struct ext4_sb_info *sbi; 395 struct ext4_sb_info *sbi;
380 int err = 0, ret; 396 int err = 0, ret, blk_free_count;
381 ext4_grpblk_t group_freed; 397 ext4_grpblk_t blocks_freed;
398 struct ext4_group_info *grp;
382 399
383 *pdquot_freed_blocks = 0;
384 sbi = EXT4_SB(sb); 400 sbi = EXT4_SB(sb);
385 es = sbi->s_es; 401 es = sbi->s_es;
386 if (block < le32_to_cpu(es->s_first_data_block) || 402 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
387 block + count < block ||
388 block + count > ext4_blocks_count(es)) {
389 ext4_error(sb, "ext4_free_blocks",
390 "Freeing blocks not in datazone - "
391 "block = %llu, count = %lu", block, count);
392 goto error_return;
393 }
394
395 ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
396 403
397do_more:
398 overflow = 0;
399 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 404 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
405 grp = ext4_get_group_info(sb, block_group);
400 /* 406 /*
401 * Check to see if we are freeing blocks across a group 407 * Check to see if we are freeing blocks across a group
402 * boundary. 408 * boundary.
403 */ 409 */
404 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 410 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
405 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 411 goto error_return;
406 count -= overflow;
407 } 412 }
408 brelse(bitmap_bh);
409 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 413 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
410 if (!bitmap_bh) 414 if (!bitmap_bh)
411 goto error_return; 415 goto error_return;
@@ -418,18 +422,17 @@ do_more:
418 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 422 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
419 in_range(block + count - 1, ext4_inode_table(sb, desc), 423 in_range(block + count - 1, ext4_inode_table(sb, desc),
420 sbi->s_itb_per_group)) { 424 sbi->s_itb_per_group)) {
421 ext4_error(sb, "ext4_free_blocks", 425 ext4_error(sb, __func__,
422 "Freeing blocks in system zones - " 426 "Adding blocks in system zones - "
423 "Block = %llu, count = %lu", 427 "Block = %llu, count = %lu",
424 block, count); 428 block, count);
425 goto error_return; 429 goto error_return;
426 } 430 }
427 431
428 /* 432 /*
429 * We are about to start releasing blocks in the bitmap, 433 * We are about to add blocks to the bitmap,
430 * so we need undo access. 434 * so we need undo access.
431 */ 435 */
432 /* @@@ check errors */
433 BUFFER_TRACE(bitmap_bh, "getting undo access"); 436 BUFFER_TRACE(bitmap_bh, "getting undo access");
434 err = ext4_journal_get_undo_access(handle, bitmap_bh); 437 err = ext4_journal_get_undo_access(handle, bitmap_bh);
435 if (err) 438 if (err)
@@ -444,107 +447,55 @@ do_more:
444 err = ext4_journal_get_write_access(handle, gd_bh); 447 err = ext4_journal_get_write_access(handle, gd_bh);
445 if (err) 448 if (err)
446 goto error_return; 449 goto error_return;
447 450 /*
448 jbd_lock_bh_state(bitmap_bh); 451 * make sure we don't allow a parallel init on other groups in the
449 452 * same buddy cache
450 for (i = 0, group_freed = 0; i < count; i++) { 453 */
451 /* 454 down_write(&grp->alloc_sem);
452 * An HJ special. This is expensive... 455 for (i = 0, blocks_freed = 0; i < count; i++) {
453 */
454#ifdef CONFIG_JBD2_DEBUG
455 jbd_unlock_bh_state(bitmap_bh);
456 {
457 struct buffer_head *debug_bh;
458 debug_bh = sb_find_get_block(sb, block + i);
459 if (debug_bh) {
460 BUFFER_TRACE(debug_bh, "Deleted!");
461 if (!bh2jh(bitmap_bh)->b_committed_data)
462 BUFFER_TRACE(debug_bh,
463 "No commited data in bitmap");
464 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
465 __brelse(debug_bh);
466 }
467 }
468 jbd_lock_bh_state(bitmap_bh);
469#endif
470 if (need_resched()) {
471 jbd_unlock_bh_state(bitmap_bh);
472 cond_resched();
473 jbd_lock_bh_state(bitmap_bh);
474 }
475 /* @@@ This prevents newly-allocated data from being
476 * freed and then reallocated within the same
477 * transaction.
478 *
479 * Ideally we would want to allow that to happen, but to
480 * do so requires making jbd2_journal_forget() capable of
481 * revoking the queued write of a data block, which
482 * implies blocking on the journal lock. *forget()
483 * cannot block due to truncate races.
484 *
485 * Eventually we can fix this by making jbd2_journal_forget()
486 * return a status indicating whether or not it was able
487 * to revoke the buffer. On successful revoke, it is
488 * safe not to set the allocation bit in the committed
489 * bitmap, because we know that there is no outstanding
490 * activity on the buffer any more and so it is safe to
491 * reallocate it.
492 */
493 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
494 J_ASSERT_BH(bitmap_bh,
495 bh2jh(bitmap_bh)->b_committed_data != NULL);
496 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
497 bh2jh(bitmap_bh)->b_committed_data);
498
499 /*
500 * We clear the bit in the bitmap after setting the committed
501 * data bit, because this is the reverse order to that which
502 * the allocator uses.
503 */
504 BUFFER_TRACE(bitmap_bh, "clear bit"); 456 BUFFER_TRACE(bitmap_bh, "clear bit");
505 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 457 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
506 bit + i, bitmap_bh->b_data)) { 458 bit + i, bitmap_bh->b_data)) {
507 jbd_unlock_bh_state(bitmap_bh);
508 ext4_error(sb, __func__, 459 ext4_error(sb, __func__,
509 "bit already cleared for block %llu", 460 "bit already cleared for block %llu",
510 (ext4_fsblk_t)(block + i)); 461 (ext4_fsblk_t)(block + i));
511 jbd_lock_bh_state(bitmap_bh);
512 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 462 BUFFER_TRACE(bitmap_bh, "bit already cleared");
513 } else { 463 } else {
514 group_freed++; 464 blocks_freed++;
515 } 465 }
516 } 466 }
517 jbd_unlock_bh_state(bitmap_bh);
518
519 spin_lock(sb_bgl_lock(sbi, block_group)); 467 spin_lock(sb_bgl_lock(sbi, block_group));
520 le16_add_cpu(&desc->bg_free_blocks_count, group_freed); 468 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
469 ext4_free_blks_set(sb, desc, blk_free_count);
521 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 470 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
522 spin_unlock(sb_bgl_lock(sbi, block_group)); 471 spin_unlock(sb_bgl_lock(sbi, block_group));
523 percpu_counter_add(&sbi->s_freeblocks_counter, count); 472 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
524 473
525 if (sbi->s_log_groups_per_flex) { 474 if (sbi->s_log_groups_per_flex) {
526 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 475 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
527 spin_lock(sb_bgl_lock(sbi, flex_group)); 476 spin_lock(sb_bgl_lock(sbi, flex_group));
528 sbi->s_flex_groups[flex_group].free_blocks += count; 477 sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
529 spin_unlock(sb_bgl_lock(sbi, flex_group)); 478 spin_unlock(sb_bgl_lock(sbi, flex_group));
530 } 479 }
480 /*
481 * request to reload the buddy with the
482 * new bitmap information
483 */
484 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
485 ext4_mb_update_group_info(grp, blocks_freed);
486 up_write(&grp->alloc_sem);
531 487
532 /* We dirtied the bitmap block */ 488 /* We dirtied the bitmap block */
533 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 489 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
534 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 490 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
535 491
536 /* And the group descriptor block */ 492 /* And the group descriptor block */
537 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 493 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
538 ret = ext4_journal_dirty_metadata(handle, gd_bh); 494 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
539 if (!err) err = ret; 495 if (!err)
540 *pdquot_freed_blocks += group_freed; 496 err = ret;
541
542 if (overflow && !err) {
543 block += count;
544 count = overflow;
545 goto do_more;
546 }
547 sb->s_dirt = 1; 497 sb->s_dirt = 1;
498
548error_return: 499error_return:
549 brelse(bitmap_bh); 500 brelse(bitmap_bh);
550 ext4_std_error(sb, err); 501 ext4_std_error(sb, err);
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
614 if (dirty_blocks < 0) { 565 if (dirty_blocks < 0) {
615 printk(KERN_CRIT "Dirty block accounting " 566 printk(KERN_CRIT "Dirty block accounting "
616 "went wrong %lld\n", 567 "went wrong %lld\n",
617 dirty_blocks); 568 (long long)dirty_blocks);
618 } 569 }
619 } 570 }
620 /* Check whether we have space after 571 /* Check whether we have space after
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
666 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 617 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
667} 618}
668 619
669#define EXT4_META_BLOCK 0x1
670
671static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
672 ext4_lblk_t iblock, ext4_fsblk_t goal,
673 unsigned long *count, int *errp, int flags)
674{
675 struct ext4_allocation_request ar;
676 ext4_fsblk_t ret;
677
678 memset(&ar, 0, sizeof(ar));
679 /* Fill with neighbour allocated blocks */
680
681 ar.inode = inode;
682 ar.goal = goal;
683 ar.len = *count;
684 ar.logical = iblock;
685
686 if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
687 /* enable in-core preallocation for data block allocation */
688 ar.flags = EXT4_MB_HINT_DATA;
689 else
690 /* disable in-core preallocation for non-regular files */
691 ar.flags = 0;
692
693 ret = ext4_mb_new_blocks(handle, &ar, errp);
694 *count = ar.len;
695 return ret;
696}
697
698/* 620/*
699 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks 621 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
700 * 622 *
701 * @handle: handle to this transaction 623 * @handle: handle to this transaction
702 * @inode: file inode 624 * @inode: file inode
703 * @goal: given target block(filesystem wide) 625 * @goal: given target block(filesystem wide)
704 * @count: total number of blocks need 626 * @count: pointer to total number of blocks needed
705 * @errp: error code 627 * @errp: error code
706 * 628 *
707 * Return 1st allocated block numberon success, *count stores total account 629 * Return 1st allocated block number on success, *count stores total account
708 * error stores in errp pointer 630 * error stores in errp pointer
709 */ 631 */
710ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 632ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
711 ext4_fsblk_t goal, unsigned long *count, int *errp) 633 ext4_fsblk_t goal, unsigned long *count, int *errp)
712{ 634{
635 struct ext4_allocation_request ar;
713 ext4_fsblk_t ret; 636 ext4_fsblk_t ret;
714 ret = do_blk_alloc(handle, inode, 0, goal, 637
715 count, errp, EXT4_META_BLOCK); 638 memset(&ar, 0, sizeof(ar));
639 /* Fill with neighbour allocated blocks */
640 ar.inode = inode;
641 ar.goal = goal;
642 ar.len = count ? *count : 1;
643
644 ret = ext4_mb_new_blocks(handle, &ar, errp);
645 if (count)
646 *count = ar.len;
647
716 /* 648 /*
717 * Account for the allocated meta blocks 649 * Account for the allocated meta blocks
718 */ 650 */
719 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 651 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
720 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 652 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
721 EXT4_I(inode)->i_allocated_meta_blocks += *count; 653 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
722 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 654 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
723 } 655 }
724 return ret; 656 return ret;
725} 657}
726 658
727/*
728 * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
729 *
730 * @handle: handle to this transaction
731 * @inode: file inode
732 * @goal: given target block(filesystem wide)
733 * @errp: error code
734 *
735 * Return allocated block number on success
736 */
737ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
738 ext4_fsblk_t goal, int *errp)
739{
740 unsigned long count = 1;
741 return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
742}
743
744/*
745 * ext4_new_blocks() -- allocate data blocks
746 *
747 * @handle: handle to this transaction
748 * @inode: file inode
749 * @goal: given target block(filesystem wide)
750 * @count: total number of blocks need
751 * @errp: error code
752 *
753 * Return 1st allocated block numberon success, *count stores total account
754 * error stores in errp pointer
755 */
756
757ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
758 ext4_lblk_t iblock, ext4_fsblk_t goal,
759 unsigned long *count, int *errp)
760{
761 return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
762}
763
764/** 659/**
765 * ext4_count_free_blocks() -- count filesystem free blocks 660 * ext4_count_free_blocks() -- count filesystem free blocks
766 * @sb: superblock 661 * @sb: superblock
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
776#ifdef EXT4FS_DEBUG 671#ifdef EXT4FS_DEBUG
777 struct ext4_super_block *es; 672 struct ext4_super_block *es;
778 ext4_fsblk_t bitmap_count; 673 ext4_fsblk_t bitmap_count;
779 unsigned long x; 674 unsigned int x;
780 struct buffer_head *bitmap_bh = NULL; 675 struct buffer_head *bitmap_bh = NULL;
781 676
782 es = EXT4_SB(sb)->s_es; 677 es = EXT4_SB(sb)->s_es;
@@ -789,15 +684,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
789 gdp = ext4_get_group_desc(sb, i, NULL); 684 gdp = ext4_get_group_desc(sb, i, NULL);
790 if (!gdp) 685 if (!gdp)
791 continue; 686 continue;
792 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 687 desc_count += ext4_free_blks_count(sb, gdp);
793 brelse(bitmap_bh); 688 brelse(bitmap_bh);
794 bitmap_bh = ext4_read_block_bitmap(sb, i); 689 bitmap_bh = ext4_read_block_bitmap(sb, i);
795 if (bitmap_bh == NULL) 690 if (bitmap_bh == NULL)
796 continue; 691 continue;
797 692
798 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 693 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
799 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 694 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
800 i, le16_to_cpu(gdp->bg_free_blocks_count), x); 695 i, ext4_free_blks_count(sb, gdp), x);
801 bitmap_count += x; 696 bitmap_count += x;
802 } 697 }
803 brelse(bitmap_bh); 698 brelse(bitmap_bh);
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
812 gdp = ext4_get_group_desc(sb, i, NULL); 707 gdp = ext4_get_group_desc(sb, i, NULL);
813 if (!gdp) 708 if (!gdp)
814 continue; 709 continue;
815 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 710 desc_count += ext4_free_blks_count(sb, gdp);
816 } 711 }
817 712
818 return desc_count; 713 return desc_count;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 0a7a6663c190..fa3af81ac565 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,10 +15,9 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) 18unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i, sum = 0;
21 unsigned long sum = 0;
22 21
23 if (!map) 22 if (!map)
24 return 0; 23 return 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fed5b610df5a..2df2e40b01af 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
64int ext4_check_dir_entry(const char *function, struct inode *dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 *de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head *bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned int offset)
68{ 68{
69 const char *error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error(dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%u, inode=%u, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
95 void *dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned int offset;
99 int i, stored; 99 int i, stored;
100 struct ext4_dir_entry_2 *de; 100 struct ext4_dir_entry_2 *de;
101 struct super_block *sb; 101 struct super_block *sb;
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
405 sb = inode->i_sb; 405 sb = inode->i_sb;
406 406
407 if (!fname) { 407 if (!fname) {
408 printk(KERN_ERR "ext4: call_filldir: called with " 408 printk(KERN_ERR "EXT4-fs: call_filldir: called with "
409 "null fname?!?\n"); 409 "null fname?!?\n");
410 return 0; 410 return 0;
411 } 411 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0537c827024..aafc9eba1c25 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -19,6 +19,7 @@
19#include <linux/types.h> 19#include <linux/types.h>
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/jbd2.h>
22#include "ext4_i.h" 23#include "ext4_i.h"
23 24
24/* 25/*
@@ -94,9 +95,9 @@ struct ext4_allocation_request {
94 /* phys. block for ^^^ */ 95 /* phys. block for ^^^ */
95 ext4_fsblk_t pright; 96 ext4_fsblk_t pright;
96 /* how many blocks we want to allocate */ 97 /* how many blocks we want to allocate */
97 unsigned long len; 98 unsigned int len;
98 /* flags. see above EXT4_MB_HINT_* */ 99 /* flags. see above EXT4_MB_HINT_* */
99 unsigned long flags; 100 unsigned int flags;
100}; 101};
101 102
102/* 103/*
@@ -156,12 +157,12 @@ struct ext4_group_desc
156 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ 157 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */
157 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ 158 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
158 __le32 bg_inode_table_lo; /* Inodes table block */ 159 __le32 bg_inode_table_lo; /* Inodes table block */
159 __le16 bg_free_blocks_count; /* Free blocks count */ 160 __le16 bg_free_blocks_count_lo;/* Free blocks count */
160 __le16 bg_free_inodes_count; /* Free inodes count */ 161 __le16 bg_free_inodes_count_lo;/* Free inodes count */
161 __le16 bg_used_dirs_count; /* Directories count */ 162 __le16 bg_used_dirs_count_lo; /* Directories count */
162 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ 163 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
163 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ 164 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
164 __le16 bg_itable_unused; /* Unused inodes count */ 165 __le16 bg_itable_unused_lo; /* Unused inodes count */
165 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ 166 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
166 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ 167 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
167 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ 168 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
@@ -169,7 +170,7 @@ struct ext4_group_desc
169 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ 170 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
170 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ 171 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
171 __le16 bg_used_dirs_count_hi; /* Directories count MSB */ 172 __le16 bg_used_dirs_count_hi; /* Directories count MSB */
172 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ 173 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
173 __u32 bg_reserved2[3]; 174 __u32 bg_reserved2[3];
174}; 175};
175 176
@@ -328,6 +329,7 @@ struct ext4_mount_options {
328 uid_t s_resuid; 329 uid_t s_resuid;
329 gid_t s_resgid; 330 gid_t s_resgid;
330 unsigned long s_commit_interval; 331 unsigned long s_commit_interval;
332 u32 s_min_batch_time, s_max_batch_time;
331#ifdef CONFIG_QUOTA 333#ifdef CONFIG_QUOTA
332 int s_jquota_fmt; 334 int s_jquota_fmt;
333 char *s_qf_names[MAXQUOTAS]; 335 char *s_qf_names[MAXQUOTAS];
@@ -534,7 +536,6 @@ do { \
534#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 536#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
535#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 537#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
536#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 538#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
537#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 539#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 540#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 541#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
726 */ 727 */
727 728
728#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 729#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
729 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) 730 ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
730#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 731#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
731 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) 732 ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
732#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 733#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
733 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) 734 ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
734#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 735#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
735 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 736 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
736#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 737#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
806#define EXT4_DEFM_JMODE_WBACK 0x0060 807#define EXT4_DEFM_JMODE_WBACK 0x0060
807 808
808/* 809/*
810 * Default journal batch times
811 */
812#define EXT4_DEF_MIN_BATCH_TIME 0
813#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
814
815/*
809 * Structure of a directory entry 816 * Structure of a directory entry
810 */ 817 */
811#define EXT4_NAME_LEN 255 818#define EXT4_NAME_LEN 255
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
891#define DX_HASH_LEGACY 0 898#define DX_HASH_LEGACY 0
892#define DX_HASH_HALF_MD4 1 899#define DX_HASH_HALF_MD4 1
893#define DX_HASH_TEA 2 900#define DX_HASH_TEA 2
901#define DX_HASH_LEGACY_UNSIGNED 3
902#define DX_HASH_HALF_MD4_UNSIGNED 4
903#define DX_HASH_TEA_UNSIGNED 5
894 904
895#ifdef __KERNEL__ 905#ifdef __KERNEL__
896 906
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
955#define ERR_BAD_DX_DIR -75000 965#define ERR_BAD_DX_DIR -75000
956 966
957void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 967void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
958 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 968 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
959 969
960extern struct proc_dir_entry *ext4_proc_root; 970extern struct proc_dir_entry *ext4_proc_root;
961 971
@@ -987,6 +997,9 @@ do { \
987# define ATTRIB_NORET __attribute__((noreturn)) 997# define ATTRIB_NORET __attribute__((noreturn))
988# define NORET_AND noreturn, 998# define NORET_AND noreturn,
989 999
1000/* bitmap.c */
1001extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
1002
990/* balloc.c */ 1003/* balloc.c */
991extern unsigned int ext4_block_group(struct super_block *sb, 1004extern unsigned int ext4_block_group(struct super_block *sb,
992 ext4_fsblk_t blocknr); 1005 ext4_fsblk_t blocknr);
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
995extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); 1008extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
996extern unsigned long ext4_bg_num_gdb(struct super_block *sb, 1009extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
997 ext4_group_t group); 1010 ext4_group_t group);
998extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
999 ext4_fsblk_t goal, int *errp);
1000extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 1011extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1001 ext4_fsblk_t goal, unsigned long *count, int *errp); 1012 ext4_fsblk_t goal, unsigned long *count, int *errp);
1002extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1003 ext4_lblk_t iblock, ext4_fsblk_t goal,
1004 unsigned long *count, int *errp);
1005extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1013extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1006extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1014extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1007extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1015extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1008 ext4_fsblk_t block, unsigned long count, int metadata); 1016 ext4_fsblk_t block, unsigned long count, int metadata);
1009extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 1017extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1010 ext4_fsblk_t block, unsigned long count, 1018 ext4_fsblk_t block, unsigned long count);
1011 unsigned long *pdquot_freed_blocks);
1012extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1019extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
1013extern void ext4_check_blocks_bitmap(struct super_block *); 1020extern void ext4_check_blocks_bitmap(struct super_block *);
1014extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1021extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1019/* dir.c */ 1026/* dir.c */
1020extern int ext4_check_dir_entry(const char *, struct inode *, 1027extern int ext4_check_dir_entry(const char *, struct inode *,
1021 struct ext4_dir_entry_2 *, 1028 struct ext4_dir_entry_2 *,
1022 struct buffer_head *, unsigned long); 1029 struct buffer_head *, unsigned int);
1023extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1030extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1024 __u32 minor_hash, 1031 __u32 minor_hash,
1025 struct ext4_dir_entry_2 *dirent); 1032 struct ext4_dir_entry_2 *dirent);
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1039extern unsigned long ext4_count_free_inodes(struct super_block *); 1046extern unsigned long ext4_count_free_inodes(struct super_block *);
1040extern unsigned long ext4_count_dirs(struct super_block *); 1047extern unsigned long ext4_count_dirs(struct super_block *);
1041extern void ext4_check_inodes_bitmap(struct super_block *); 1048extern void ext4_check_inodes_bitmap(struct super_block *);
1042extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1043 1049
1044/* mballoc.c */ 1050/* mballoc.c */
1045extern long ext4_mb_stats; 1051extern long ext4_mb_stats;
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
1054extern void exit_ext4_mballoc(void); 1060extern void exit_ext4_mballoc(void);
1055extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1061extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1056 unsigned long, unsigned long, int, unsigned long *); 1062 unsigned long, unsigned long, int, unsigned long *);
1057extern int ext4_mb_add_more_groupinfo(struct super_block *sb, 1063extern int ext4_mb_add_groupinfo(struct super_block *sb,
1058 ext4_group_t i, struct ext4_group_desc *desc); 1064 ext4_group_t i, struct ext4_group_desc *desc);
1059extern void ext4_mb_update_group_info(struct ext4_group_info *grp, 1065extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1060 ext4_grpblk_t add); 1066 ext4_grpblk_t add);
1061 1067extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1062 1068extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1069 ext4_group_t, int);
1063/* inode.c */ 1070/* inode.c */
1064int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1071int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
1065 struct buffer_head *bh, ext4_fsblk_t blocknr); 1072 struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
1069 ext4_lblk_t, int, int *); 1076 ext4_lblk_t, int, int *);
1070int ext4_get_block(struct inode *inode, sector_t iblock, 1077int ext4_get_block(struct inode *inode, sector_t iblock,
1071 struct buffer_head *bh_result, int create); 1078 struct buffer_head *bh_result, int create);
1072int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1073 ext4_lblk_t iblock, unsigned long maxblocks,
1074 struct buffer_head *bh_result,
1075 int create, int extend_disksize);
1076 1079
1077extern struct inode *ext4_iget(struct super_block *, unsigned long); 1080extern struct inode *ext4_iget(struct super_block *, unsigned long);
1078extern int ext4_write_inode(struct inode *, int); 1081extern int ext4_write_inode(struct inode *, int);
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1123 __attribute__ ((format (printf, 3, 4))); 1126 __attribute__ ((format (printf, 3, 4)));
1124extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1127extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1125 __attribute__ ((format (printf, 3, 4))); 1128 __attribute__ ((format (printf, 3, 4)));
1129extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
1130 const char *, const char *, ...)
1131 __attribute__ ((format (printf, 4, 5)));
1126extern void ext4_update_dynamic_rev(struct super_block *sb); 1132extern void ext4_update_dynamic_rev(struct super_block *sb);
1127extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1133extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1128 __u32 compat); 1134 __u32 compat);
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
1136 struct ext4_group_desc *bg); 1142 struct ext4_group_desc *bg);
1137extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, 1143extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
1138 struct ext4_group_desc *bg); 1144 struct ext4_group_desc *bg);
1145extern __u32 ext4_free_blks_count(struct super_block *sb,
1146 struct ext4_group_desc *bg);
1147extern __u32 ext4_free_inodes_count(struct super_block *sb,
1148 struct ext4_group_desc *bg);
1149extern __u32 ext4_used_dirs_count(struct super_block *sb,
1150 struct ext4_group_desc *bg);
1151extern __u32 ext4_itable_unused_count(struct super_block *sb,
1152 struct ext4_group_desc *bg);
1139extern void ext4_block_bitmap_set(struct super_block *sb, 1153extern void ext4_block_bitmap_set(struct super_block *sb,
1140 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1154 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1141extern void ext4_inode_bitmap_set(struct super_block *sb, 1155extern void ext4_inode_bitmap_set(struct super_block *sb,
1142 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1156 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1143extern void ext4_inode_table_set(struct super_block *sb, 1157extern void ext4_inode_table_set(struct super_block *sb,
1144 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1158 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1159extern void ext4_free_blks_set(struct super_block *sb,
1160 struct ext4_group_desc *bg, __u32 count);
1161extern void ext4_free_inodes_set(struct super_block *sb,
1162 struct ext4_group_desc *bg, __u32 count);
1163extern void ext4_used_dirs_set(struct super_block *sb,
1164 struct ext4_group_desc *bg, __u32 count);
1165extern void ext4_itable_unused_set(struct super_block *sb,
1166 struct ext4_group_desc *bg, __u32 count);
1145 1167
1146static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 1168static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
1147{ 1169{
@@ -1184,8 +1206,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
1184 1206
1185static inline loff_t ext4_isize(struct ext4_inode *raw_inode) 1207static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
1186{ 1208{
1187 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | 1209 if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
1188 le32_to_cpu(raw_inode->i_size_lo); 1210 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
1211 le32_to_cpu(raw_inode->i_size_lo);
1212 else
1213 return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
1189} 1214}
1190 1215
1191static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) 1216static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
@@ -1225,11 +1250,11 @@ do { \
1225} while (0) 1250} while (0)
1226 1251
1227#ifdef CONFIG_SMP 1252#ifdef CONFIG_SMP
1228/* Each CPU can accumulate FBC_BATCH blocks in their local 1253/* Each CPU can accumulate percpu_counter_batch blocks in their local
1229 * counters. So we need to make sure we have free blocks more 1254 * counters. So we need to make sure we have free blocks more
1230 * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. 1255 * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
1231 */ 1256 */
1232#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) 1257#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
1233#else 1258#else
1234#define EXT4_FREEBLOCKS_WATERMARK 0 1259#define EXT4_FREEBLOCKS_WATERMARK 0
1235#endif 1260#endif
@@ -1246,6 +1271,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1246 return ; 1271 return ;
1247} 1272}
1248 1273
1274struct ext4_group_info {
1275 unsigned long bb_state;
1276 struct rb_root bb_free_root;
1277 unsigned short bb_first_free;
1278 unsigned short bb_free;
1279 unsigned short bb_fragments;
1280 struct list_head bb_prealloc_list;
1281#ifdef DOUBLE_CHECK
1282 void *bb_bitmap;
1283#endif
1284 struct rw_semaphore alloc_sem;
1285 unsigned short bb_counters[];
1286};
1287
1288#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
1289#define EXT4_GROUP_INFO_LOCKED_BIT 1
1290
1291#define EXT4_MB_GRP_NEED_INIT(grp) \
1292 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1293
1294static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1295{
1296 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1297
1298 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1299}
1300
1301static inline void ext4_unlock_group(struct super_block *sb,
1302 ext4_group_t group)
1303{
1304 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1305
1306 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1307}
1308
1309static inline int ext4_is_group_locked(struct super_block *sb,
1310 ext4_group_t group)
1311{
1312 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1313
1314 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
1315 &(grinfo->bb_state));
1316}
1317
1249/* 1318/*
1250 * Inodes and files operations 1319 * Inodes and files operations
1251 */ 1320 */
@@ -1271,18 +1340,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1271extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1340extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1272 int chunk); 1341 int chunk);
1273extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1342extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1274 ext4_lblk_t iblock, 1343 ext4_lblk_t iblock, unsigned int max_blocks,
1275 unsigned long max_blocks, struct buffer_head *bh_result, 1344 struct buffer_head *bh_result,
1276 int create, int extend_disksize); 1345 int create, int extend_disksize);
1277extern void ext4_ext_truncate(struct inode *); 1346extern void ext4_ext_truncate(struct inode *);
1278extern void ext4_ext_init(struct super_block *); 1347extern void ext4_ext_init(struct super_block *);
1279extern void ext4_ext_release(struct super_block *); 1348extern void ext4_ext_release(struct super_block *);
1280extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1349extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1281 loff_t len); 1350 loff_t len);
1282extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1351extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1283 sector_t block, unsigned long max_blocks, 1352 sector_t block, unsigned int max_blocks,
1284 struct buffer_head *bh, int create, 1353 struct buffer_head *bh, int create,
1285 int extend_disksize, int flag); 1354 int extend_disksize, int flag);
1355extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1356 __u64 start, __u64 len);
1357
1358/*
1359 * Add new method to test wether block and inode bitmaps are properly
1360 * initialized. With uninit_bg reading the block from disk is not enough
1361 * to mark the bitmap uptodate. We need to also zero-out the bitmap
1362 */
1363#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
1364
1365static inline int bitmap_uptodate(struct buffer_head *bh)
1366{
1367 return (buffer_uptodate(bh) &&
1368 test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
1369}
1370static inline void set_bitmap_uptodate(struct buffer_head *bh)
1371{
1372 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1373}
1374
1286#endif /* __KERNEL__ */ 1375#endif /* __KERNEL__ */
1287 1376
1288#endif /* _EXT4_H */ 1377#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bec7ce59fc0d..18cb67b2cbbc 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); 194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
195} 195}
196 196
197static inline void ext4_ext_tree_changed(struct inode *inode)
198{
199 EXT4_I(inode)->i_ext_generation++;
200}
201
202static inline void 197static inline void
203ext4_ext_invalidate_cache(struct inode *inode) 198ext4_ext_invalidate_cache(struct inode *inode)
204{ 199{
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 5c124c0ac6d3..e69acc16f5c4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
31typedef __u32 ext4_lblk_t; 31typedef __u32 ext4_lblk_t;
32 32
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned int ext4_group_t;
35 35
36#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
37#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
@@ -100,9 +100,6 @@ struct ext4_inode_info {
100 */ 100 */
101 loff_t i_disksize; 101 loff_t i_disksize;
102 102
103 /* on-disk additional length */
104 __u16 i_extra_isize;
105
106 /* 103 /*
107 * i_data_sem is for serialising ext4_truncate() against 104 * i_data_sem is for serialising ext4_truncate() against
108 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's 105 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
@@ -117,7 +114,6 @@ struct ext4_inode_info {
117 struct inode vfs_inode; 114 struct inode vfs_inode;
118 struct jbd2_inode jinode; 115 struct jbd2_inode jinode;
119 116
120 unsigned long i_ext_generation;
121 struct ext4_ext_cache i_cached_extent; 117 struct ext4_ext_cache i_cached_extent;
122 /* 118 /*
123 * File creation time. Its function is same as that of 119 * File creation time. Its function is same as that of
@@ -130,10 +126,14 @@ struct ext4_inode_info {
130 spinlock_t i_prealloc_lock; 126 spinlock_t i_prealloc_lock;
131 127
132 /* allocation reservation info for delalloc */ 128 /* allocation reservation info for delalloc */
133 unsigned long i_reserved_data_blocks; 129 unsigned int i_reserved_data_blocks;
134 unsigned long i_reserved_meta_blocks; 130 unsigned int i_reserved_meta_blocks;
135 unsigned long i_allocated_meta_blocks; 131 unsigned int i_allocated_meta_blocks;
136 unsigned short i_delalloc_reserved_flag; 132 unsigned short i_delalloc_reserved_flag;
133
134 /* on-disk additional length */
135 __u16 i_extra_isize;
136
137 spinlock_t i_block_reservation_lock; 137 spinlock_t i_block_reservation_lock;
138}; 138};
139 139
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index c75384b34f2c..ad13a84644e1 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,53 +7,96 @@
7int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 7int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh) 8 struct buffer_head *bh)
9{ 9{
10 int err = jbd2_journal_get_undo_access(handle, bh); 10 int err = 0;
11 if (err) 11
12 ext4_journal_abort_handle(where, __func__, bh, handle, err); 12 if (ext4_handle_valid(handle)) {
13 err = jbd2_journal_get_undo_access(handle, bh);
14 if (err)
15 ext4_journal_abort_handle(where, __func__, bh,
16 handle, err);
17 }
13 return err; 18 return err;
14} 19}
15 20
16int __ext4_journal_get_write_access(const char *where, handle_t *handle, 21int __ext4_journal_get_write_access(const char *where, handle_t *handle,
17 struct buffer_head *bh) 22 struct buffer_head *bh)
18{ 23{
19 int err = jbd2_journal_get_write_access(handle, bh); 24 int err = 0;
20 if (err) 25
21 ext4_journal_abort_handle(where, __func__, bh, handle, err); 26 if (ext4_handle_valid(handle)) {
27 err = jbd2_journal_get_write_access(handle, bh);
28 if (err)
29 ext4_journal_abort_handle(where, __func__, bh,
30 handle, err);
31 }
22 return err; 32 return err;
23} 33}
24 34
25int __ext4_journal_forget(const char *where, handle_t *handle, 35int __ext4_journal_forget(const char *where, handle_t *handle,
26 struct buffer_head *bh) 36 struct buffer_head *bh)
27{ 37{
28 int err = jbd2_journal_forget(handle, bh); 38 int err = 0;
29 if (err) 39
30 ext4_journal_abort_handle(where, __func__, bh, handle, err); 40 if (ext4_handle_valid(handle)) {
41 err = jbd2_journal_forget(handle, bh);
42 if (err)
43 ext4_journal_abort_handle(where, __func__, bh,
44 handle, err);
45 }
31 return err; 46 return err;
32} 47}
33 48
34int __ext4_journal_revoke(const char *where, handle_t *handle, 49int __ext4_journal_revoke(const char *where, handle_t *handle,
35 ext4_fsblk_t blocknr, struct buffer_head *bh) 50 ext4_fsblk_t blocknr, struct buffer_head *bh)
36{ 51{
37 int err = jbd2_journal_revoke(handle, blocknr, bh); 52 int err = 0;
38 if (err) 53
39 ext4_journal_abort_handle(where, __func__, bh, handle, err); 54 if (ext4_handle_valid(handle)) {
55 err = jbd2_journal_revoke(handle, blocknr, bh);
56 if (err)
57 ext4_journal_abort_handle(where, __func__, bh,
58 handle, err);
59 }
40 return err; 60 return err;
41} 61}
42 62
43int __ext4_journal_get_create_access(const char *where, 63int __ext4_journal_get_create_access(const char *where,
44 handle_t *handle, struct buffer_head *bh) 64 handle_t *handle, struct buffer_head *bh)
45{ 65{
46 int err = jbd2_journal_get_create_access(handle, bh); 66 int err = 0;
47 if (err) 67
48 ext4_journal_abort_handle(where, __func__, bh, handle, err); 68 if (ext4_handle_valid(handle)) {
69 err = jbd2_journal_get_create_access(handle, bh);
70 if (err)
71 ext4_journal_abort_handle(where, __func__, bh,
72 handle, err);
73 }
49 return err; 74 return err;
50} 75}
51 76
52int __ext4_journal_dirty_metadata(const char *where, 77int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
53 handle_t *handle, struct buffer_head *bh) 78 struct inode *inode, struct buffer_head *bh)
54{ 79{
55 int err = jbd2_journal_dirty_metadata(handle, bh); 80 int err = 0;
56 if (err) 81
57 ext4_journal_abort_handle(where, __func__, bh, handle, err); 82 if (ext4_handle_valid(handle)) {
83 err = jbd2_journal_dirty_metadata(handle, bh);
84 if (err)
85 ext4_journal_abort_handle(where, __func__, bh,
86 handle, err);
87 } else {
88 mark_buffer_dirty(bh);
89 if (inode && inode_needs_sync(inode)) {
90 sync_dirty_buffer(bh);
91 if (buffer_req(bh) && !buffer_uptodate(bh)) {
92 ext4_error(inode->i_sb, __func__,
93 "IO error syncing inode, "
94 "inode=%lu, block=%llu",
95 inode->i_ino,
96 (unsigned long long) bh->b_blocknr);
97 err = -EIO;
98 }
99 }
100 }
58 return err; 101 return err;
59} 102}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b455c685a98b..be2f426f6805 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -32,8 +32,8 @@
32 * 5 levels of tree + root which are stored in the inode. */ 32 * 5 levels of tree + root which are stored in the inode. */
33 33
34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ 34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ 35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
36 || test_opt(sb, EXTENTS) ? 27U : 8U) 36 ? 27U : 8U)
37 37
38/* Extended attribute operations touch at most two data buffers, 38/* Extended attribute operations touch at most two data buffers,
39 * two bitmap buffers, and two group summaries, in addition to the inode 39 * two bitmap buffers, and two group summaries, in addition to the inode
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
122 * been done yet. 122 * been done yet.
123 */ 123 */
124 124
125static inline void ext4_journal_release_buffer(handle_t *handle,
126 struct buffer_head *bh)
127{
128 jbd2_journal_release_buffer(handle, bh);
129}
130
131void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, const char *err_fn,
132 struct buffer_head *bh, handle_t *handle, int err); 126 struct buffer_head *bh, handle_t *handle, int err);
133 127
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
146int __ext4_journal_get_create_access(const char *where, 140int __ext4_journal_get_create_access(const char *where,
147 handle_t *handle, struct buffer_head *bh); 141 handle_t *handle, struct buffer_head *bh);
148 142
149int __ext4_journal_dirty_metadata(const char *where, 143int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
150 handle_t *handle, struct buffer_head *bh); 144 struct inode *inode, struct buffer_head *bh);
151 145
152#define ext4_journal_get_undo_access(handle, bh) \ 146#define ext4_journal_get_undo_access(handle, bh) \
153 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 147 __ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
157 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 151 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
158#define ext4_journal_get_create_access(handle, bh) \ 152#define ext4_journal_get_create_access(handle, bh) \
159 __ext4_journal_get_create_access(__func__, (handle), (bh)) 153 __ext4_journal_get_create_access(__func__, (handle), (bh))
160#define ext4_journal_dirty_metadata(handle, bh) \
161 __ext4_journal_dirty_metadata(__func__, (handle), (bh))
162#define ext4_journal_forget(handle, bh) \ 154#define ext4_journal_forget(handle, bh) \
163 __ext4_journal_forget(__func__, (handle), (bh)) 155 __ext4_journal_forget(__func__, (handle), (bh))
156#define ext4_handle_dirty_metadata(handle, inode, bh) \
157 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
164 158
165handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 159handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
166int __ext4_journal_stop(const char *where, handle_t *handle); 160int __ext4_journal_stop(const char *where, handle_t *handle);
167 161
162#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
163
164static inline int ext4_handle_valid(handle_t *handle)
165{
166 if (handle == EXT4_NOJOURNAL_HANDLE)
167 return 0;
168 return 1;
169}
170
171static inline void ext4_handle_sync(handle_t *handle)
172{
173 if (ext4_handle_valid(handle))
174 handle->h_sync = 1;
175}
176
177static inline void ext4_handle_release_buffer(handle_t *handle,
178 struct buffer_head *bh)
179{
180 if (ext4_handle_valid(handle))
181 jbd2_journal_release_buffer(handle, bh);
182}
183
184static inline int ext4_handle_is_aborted(handle_t *handle)
185{
186 if (ext4_handle_valid(handle))
187 return is_handle_aborted(handle);
188 return 0;
189}
190
191static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
192{
193 if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
194 return 0;
195 return 1;
196}
197
198static inline void ext4_journal_release_buffer(handle_t *handle,
199 struct buffer_head *bh)
200{
201 if (ext4_handle_valid(handle))
202 jbd2_journal_release_buffer(handle, bh);
203}
204
168static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 205static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
169{ 206{
170 return ext4_journal_start_sb(inode->i_sb, nblocks); 207 return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
180 217
181static inline int ext4_journal_extend(handle_t *handle, int nblocks) 218static inline int ext4_journal_extend(handle_t *handle, int nblocks)
182{ 219{
183 return jbd2_journal_extend(handle, nblocks); 220 if (ext4_handle_valid(handle))
221 return jbd2_journal_extend(handle, nblocks);
222 return 0;
184} 223}
185 224
186static inline int ext4_journal_restart(handle_t *handle, int nblocks) 225static inline int ext4_journal_restart(handle_t *handle, int nblocks)
187{ 226{
188 return jbd2_journal_restart(handle, nblocks); 227 if (ext4_handle_valid(handle))
228 return jbd2_journal_restart(handle, nblocks);
229 return 0;
189} 230}
190 231
191static inline int ext4_journal_blocks_per_page(struct inode *inode) 232static inline int ext4_journal_blocks_per_page(struct inode *inode)
192{ 233{
193 return jbd2_journal_blocks_per_page(inode); 234 if (EXT4_JOURNAL(inode) != NULL)
235 return jbd2_journal_blocks_per_page(inode);
236 return 0;
194} 237}
195 238
196static inline int ext4_journal_force_commit(journal_t *journal) 239static inline int ext4_journal_force_commit(journal_t *journal)
197{ 240{
198 return jbd2_journal_force_commit(journal); 241 if (journal)
242 return jbd2_journal_force_commit(journal);
243 return 0;
199} 244}
200 245
201static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 246static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
202{ 247{
203 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 248 if (ext4_handle_valid(handle))
249 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
250 return 0;
204} 251}
205 252
206/* super.c */ 253/* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
208 255
209static inline int ext4_should_journal_data(struct inode *inode) 256static inline int ext4_should_journal_data(struct inode *inode)
210{ 257{
258 if (EXT4_JOURNAL(inode) == NULL)
259 return 0;
211 if (!S_ISREG(inode->i_mode)) 260 if (!S_ISREG(inode->i_mode))
212 return 1; 261 return 1;
213 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 262 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
219 268
220static inline int ext4_should_order_data(struct inode *inode) 269static inline int ext4_should_order_data(struct inode *inode)
221{ 270{
271 if (EXT4_JOURNAL(inode) == NULL)
272 return 0;
222 if (!S_ISREG(inode->i_mode)) 273 if (!S_ISREG(inode->i_mode))
223 return 0; 274 return 0;
224 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 275 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
230 281
231static inline int ext4_should_writeback_data(struct inode *inode) 282static inline int ext4_should_writeback_data(struct inode *inode)
232{ 283{
284 if (EXT4_JOURNAL(inode) == NULL)
285 return 0;
233 if (!S_ISREG(inode->i_mode)) 286 if (!S_ISREG(inode->i_mode))
234 return 0; 287 return 0;
235 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 288 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 445fde603df8..039b6ea1a042 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -57,6 +57,7 @@ struct ext4_sb_info {
57 u32 s_next_generation; 57 u32 s_next_generation;
58 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
59 int s_def_hash_version; 59 int s_def_hash_version;
60 int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
60 struct percpu_counter s_freeblocks_counter; 61 struct percpu_counter s_freeblocks_counter;
61 struct percpu_counter s_freeinodes_counter; 62 struct percpu_counter s_freeinodes_counter;
62 struct percpu_counter s_dirs_counter; 63 struct percpu_counter s_dirs_counter;
@@ -73,6 +74,8 @@ struct ext4_sb_info {
73 struct journal_s *s_journal; 74 struct journal_s *s_journal;
74 struct list_head s_orphan; 75 struct list_head s_orphan;
75 unsigned long s_commit_interval; 76 unsigned long s_commit_interval;
77 u32 s_max_batch_time;
78 u32 s_min_batch_time;
76 struct block_device *journal_bdev; 79 struct block_device *journal_bdev;
77#ifdef CONFIG_JBD2_DEBUG 80#ifdef CONFIG_JBD2_DEBUG
78 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ 81 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
@@ -101,7 +104,8 @@ struct ext4_sb_info {
101 spinlock_t s_reserve_lock; 104 spinlock_t s_reserve_lock;
102 spinlock_t s_md_lock; 105 spinlock_t s_md_lock;
103 tid_t s_last_transaction; 106 tid_t s_last_transaction;
104 unsigned short *s_mb_offsets, *s_mb_maxs; 107 unsigned short *s_mb_offsets;
108 unsigned int *s_mb_maxs;
105 109
106 /* tunables */ 110 /* tunables */
107 unsigned long s_stripe; 111 unsigned long s_stripe;
@@ -146,4 +150,10 @@ struct ext4_sb_info {
146 struct flex_groups *s_flex_groups; 150 struct flex_groups *s_flex_groups;
147}; 151};
148 152
153static inline spinlock_t *
154sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
155{
156 return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
157}
158
149#endif /* _EXT4_SB */ 159#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae66..e2eab196875f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
97{ 97{
98 int err; 98 int err;
99 99
100 if (!ext4_handle_valid(handle))
101 return 0;
100 if (handle->h_buffer_credits > needed) 102 if (handle->h_buffer_credits > needed)
101 return 0; 103 return 0;
102 err = ext4_journal_extend(handle, needed); 104 err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
134 int err; 136 int err;
135 if (path->p_bh) { 137 if (path->p_bh) {
136 /* path points to block */ 138 /* path points to block */
137 err = ext4_journal_dirty_metadata(handle, path->p_bh); 139 err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
138 } else { 140 } else {
139 /* path points to leaf/index in inode body */ 141 /* path points to leaf/index in inode body */
140 err = ext4_mark_inode_dirty(handle, inode); 142 err = ext4_mark_inode_dirty(handle, inode);
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
191 ext4_fsblk_t goal, newblock; 193 ext4_fsblk_t goal, newblock;
192 194
193 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 195 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
194 newblock = ext4_new_meta_block(handle, inode, goal, err); 196 newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
195 return newblock; 197 return newblock;
196} 198}
197 199
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
780 set_buffer_uptodate(bh); 782 set_buffer_uptodate(bh);
781 unlock_buffer(bh); 783 unlock_buffer(bh);
782 784
783 err = ext4_journal_dirty_metadata(handle, bh); 785 err = ext4_handle_dirty_metadata(handle, inode, bh);
784 if (err) 786 if (err)
785 goto cleanup; 787 goto cleanup;
786 brelse(bh); 788 brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
859 set_buffer_uptodate(bh); 861 set_buffer_uptodate(bh);
860 unlock_buffer(bh); 862 unlock_buffer(bh);
861 863
862 err = ext4_journal_dirty_metadata(handle, bh); 864 err = ext4_handle_dirty_metadata(handle, inode, bh);
863 if (err) 865 if (err)
864 goto cleanup; 866 goto cleanup;
865 brelse(bh); 867 brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
955 set_buffer_uptodate(bh); 957 set_buffer_uptodate(bh);
956 unlock_buffer(bh); 958 unlock_buffer(bh);
957 959
958 err = ext4_journal_dirty_metadata(handle, bh); 960 err = ext4_handle_dirty_metadata(handle, inode, bh);
959 if (err) 961 if (err)
960 goto out; 962 goto out;
961 963
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1160 while (--depth >= 0) { 1162 while (--depth >= 0) {
1161 ix = path[depth].p_idx; 1163 ix = path[depth].p_idx;
1162 if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) 1164 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1163 break; 1165 goto got_index;
1164 } 1166 }
1165 1167
1166 if (depth < 0) { 1168 /* we've gone up to the root and found no index to the right */
1167 /* we've gone up to the root and 1169 return 0;
1168 * found no index to the right */
1169 return 0;
1170 }
1171 1170
1171got_index:
1172 /* we've found index to the right, let's 1172 /* we've found index to the right, let's
1173 * follow it and find the closest allocated 1173 * follow it and find the closest allocated
1174 * block to the right */ 1174 * block to the right */
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1201 *phys = ext_pblock(ex); 1201 *phys = ext_pblock(ex);
1202 put_bh(bh); 1202 put_bh(bh);
1203 return 0; 1203 return 0;
1204
1205} 1204}
1206 1205
1207/* 1206/*
@@ -1622,7 +1621,6 @@ cleanup:
1622 ext4_ext_drop_refs(npath); 1621 ext4_ext_drop_refs(npath);
1623 kfree(npath); 1622 kfree(npath);
1624 } 1623 }
1625 ext4_ext_tree_changed(inode);
1626 ext4_ext_invalidate_cache(inode); 1624 ext4_ext_invalidate_cache(inode);
1627 return err; 1625 return err;
1628} 1626}
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2233 } 2231 }
2234 } 2232 }
2235out: 2233out:
2236 ext4_ext_tree_changed(inode);
2237 ext4_ext_drop_refs(path); 2234 ext4_ext_drop_refs(path);
2238 kfree(path); 2235 kfree(path);
2239 ext4_journal_stop(handle); 2236 ext4_journal_stop(handle);
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
2250 * possible initialization would be here 2247 * possible initialization would be here
2251 */ 2248 */
2252 2249
2253 if (test_opt(sb, EXTENTS)) { 2250 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2254 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2251 printk(KERN_INFO "EXT4-fs: file extents enabled");
2255#ifdef AGGRESSIVE_TEST 2252#ifdef AGGRESSIVE_TEST
2256 printk(", aggressive tests"); 2253 printk(", aggressive tests");
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
2275 */ 2272 */
2276void ext4_ext_release(struct super_block *sb) 2273void ext4_ext_release(struct super_block *sb)
2277{ 2274{
2278 if (!test_opt(sb, EXTENTS)) 2275 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2279 return; 2276 return;
2280 2277
2281#ifdef EXTENTS_STATS 2278#ifdef EXTENTS_STATS
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2380 struct inode *inode, 2377 struct inode *inode,
2381 struct ext4_ext_path *path, 2378 struct ext4_ext_path *path,
2382 ext4_lblk_t iblock, 2379 ext4_lblk_t iblock,
2383 unsigned long max_blocks) 2380 unsigned int max_blocks)
2384{ 2381{
2385 struct ext4_extent *ex, newex, orig_ex; 2382 struct ext4_extent *ex, newex, orig_ex;
2386 struct ext4_extent *ex1 = NULL; 2383 struct ext4_extent *ex1 = NULL;
@@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2536 */ 2533 */
2537 newdepth = ext_depth(inode); 2534 newdepth = ext_depth(inode);
2538 /* 2535 /*
2539 * update the extent length after successfull insert of the 2536 * update the extent length after successful insert of the
2540 * split extent 2537 * split extent
2541 */ 2538 */
2542 orig_ex.ee_len = cpu_to_le16(ee_len - 2539 orig_ex.ee_len = cpu_to_le16(ee_len -
@@ -2678,26 +2675,26 @@ fix_extent_len:
2678 */ 2675 */
2679int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2676int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2680 ext4_lblk_t iblock, 2677 ext4_lblk_t iblock,
2681 unsigned long max_blocks, struct buffer_head *bh_result, 2678 unsigned int max_blocks, struct buffer_head *bh_result,
2682 int create, int extend_disksize) 2679 int create, int extend_disksize)
2683{ 2680{
2684 struct ext4_ext_path *path = NULL; 2681 struct ext4_ext_path *path = NULL;
2685 struct ext4_extent_header *eh; 2682 struct ext4_extent_header *eh;
2686 struct ext4_extent newex, *ex; 2683 struct ext4_extent newex, *ex;
2687 ext4_fsblk_t goal, newblock; 2684 ext4_fsblk_t newblock;
2688 int err = 0, depth, ret; 2685 int err = 0, depth, ret, cache_type;
2689 unsigned long allocated = 0; 2686 unsigned int allocated = 0;
2690 struct ext4_allocation_request ar; 2687 struct ext4_allocation_request ar;
2691 loff_t disksize; 2688 loff_t disksize;
2692 2689
2693 __clear_bit(BH_New, &bh_result->b_state); 2690 __clear_bit(BH_New, &bh_result->b_state);
2694 ext_debug("blocks %u/%lu requested for inode %u\n", 2691 ext_debug("blocks %u/%u requested for inode %u\n",
2695 iblock, max_blocks, inode->i_ino); 2692 iblock, max_blocks, inode->i_ino);
2696 2693
2697 /* check in cache */ 2694 /* check in cache */
2698 goal = ext4_ext_in_cache(inode, iblock, &newex); 2695 cache_type = ext4_ext_in_cache(inode, iblock, &newex);
2699 if (goal) { 2696 if (cache_type) {
2700 if (goal == EXT4_EXT_CACHE_GAP) { 2697 if (cache_type == EXT4_EXT_CACHE_GAP) {
2701 if (!create) { 2698 if (!create) {
2702 /* 2699 /*
2703 * block isn't allocated yet and 2700 * block isn't allocated yet and
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2706 goto out2; 2703 goto out2;
2707 } 2704 }
2708 /* we should allocate requested block */ 2705 /* we should allocate requested block */
2709 } else if (goal == EXT4_EXT_CACHE_EXTENT) { 2706 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
2710 /* block is already allocated */ 2707 /* block is already allocated */
2711 newblock = iblock 2708 newblock = iblock
2712 - le32_to_cpu(newex.ee_block) 2709 - le32_to_cpu(newex.ee_block)
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2854 if (!newblock) 2851 if (!newblock)
2855 goto out2; 2852 goto out2;
2856 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2853 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2857 goal, newblock, allocated); 2854 ar.goal, newblock, allocated);
2858 2855
2859 /* try to insert new extent into found leaf and return */ 2856 /* try to insert new extent into found leaf and return */
2860 ext4_ext_store_pblock(&newex, newblock); 2857 ext4_ext_store_pblock(&newex, newblock);
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode)
2950 * transaction synchronous. 2947 * transaction synchronous.
2951 */ 2948 */
2952 if (IS_SYNC(inode)) 2949 if (IS_SYNC(inode))
2953 handle->h_sync = 1; 2950 ext4_handle_sync(handle);
2954 2951
2955out_stop: 2952out_stop:
2956 up_write(&EXT4_I(inode)->i_data_sem); 2953 up_write(&EXT4_I(inode)->i_data_sem);
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3004 handle_t *handle; 3001 handle_t *handle;
3005 ext4_lblk_t block; 3002 ext4_lblk_t block;
3006 loff_t new_size; 3003 loff_t new_size;
3007 unsigned long max_blocks; 3004 unsigned int max_blocks;
3008 int ret = 0; 3005 int ret = 0;
3009 int ret2 = 0; 3006 int ret2 = 0;
3010 int retries = 0; 3007 int retries = 0;
@@ -3051,7 +3048,7 @@ retry:
3051 WARN_ON(ret <= 0); 3048 WARN_ON(ret <= 0);
3052 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3049 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3053 "returned error inode#%lu, block=%u, " 3050 "returned error inode#%lu, block=%u, "
3054 "max_blocks=%lu", __func__, 3051 "max_blocks=%u", __func__,
3055 inode->i_ino, block, max_blocks); 3052 inode->i_ino, block, max_blocks);
3056#endif 3053#endif
3057 ext4_mark_inode_dirty(handle, inode); 3054 ext4_mark_inode_dirty(handle, inode);
@@ -3083,7 +3080,7 @@ retry:
3083/* 3080/*
3084 * Callback function called for each extent to gather FIEMAP information. 3081 * Callback function called for each extent to gather FIEMAP information.
3085 */ 3082 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3083static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3084 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data) 3085 void *data)
3089{ 3086{
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3152/* fiemap flags we can handle specified here */ 3149/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 3150#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154 3151
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) 3152static int ext4_xattr_fiemap(struct inode *inode,
3153 struct fiemap_extent_info *fieinfo)
3156{ 3154{
3157 __u64 physical = 0; 3155 __u64 physical = 0;
3158 __u64 length; 3156 __u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fba71f7..f731cb545a03 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
146const struct file_operations ext4_file_operations = { 143const struct file_operations ext4_file_operations = {
147 .llseek = generic_file_llseek, 144 .llseek = generic_file_llseek,
148 .read = do_sync_read, 145 .read = do_sync_read,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 556ca8eba3db..ac8f168c8ab4 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash(const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
68}
69
70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
71{
72 __u32 pad, val;
73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
49} 96}
50 97
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
52{ 99{
53 __u32 pad, val; 100 __u32 pad, val;
54 int i; 101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i = 0; i < len; i++) { 110 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 08cac9fcace2..4fb86a0061d0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
74 /* If checksum is bad mark all blocks and inodes use to prevent 74 /* If checksum is bad mark all blocks and inodes use to prevent
75 * allocation, essentially implementing a per-group read-only flag. */ 75 * allocation, essentially implementing a per-group read-only flag. */
76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
77 ext4_error(sb, __func__, "Checksum bad for group %lu\n", 77 ext4_error(sb, __func__, "Checksum bad for group %u",
78 block_group); 78 block_group);
79 gdp->bg_free_blocks_count = 0; 79 ext4_free_blks_set(sb, gdp, 0);
80 gdp->bg_free_inodes_count = 0; 80 ext4_free_inodes_set(sb, gdp, 0);
81 gdp->bg_itable_unused = 0; 81 ext4_itable_unused_set(sb, gdp, 0);
82 memset(bh->b_data, 0xff, sb->s_blocksize); 82 memset(bh->b_data, 0xff, sb->s_blocksize);
83 return 0; 83 return 0;
84 } 84 }
85 85
86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
88 bh->b_data); 88 bh->b_data);
89 89
90 return EXT4_INODES_PER_GROUP(sb); 90 return EXT4_INODES_PER_GROUP(sb);
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 if (unlikely(!bh)) { 111 if (unlikely(!bh)) {
112 ext4_error(sb, __func__, 112 ext4_error(sb, __func__,
113 "Cannot read inode bitmap - " 113 "Cannot read inode bitmap - "
114 "block_group = %lu, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (buffer_uptodate(bh) && 118 if (bitmap_uptodate(bh))
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
120 return bh; 119 return bh;
121 120
122 lock_buffer(bh); 121 lock_buffer(bh);
122 if (bitmap_uptodate(bh)) {
123 unlock_buffer(bh);
124 return bh;
125 }
123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 126 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
125 ext4_init_inode_bitmap(sb, bh, block_group, desc); 128 ext4_init_inode_bitmap(sb, bh, block_group, desc);
129 set_bitmap_uptodate(bh);
126 set_buffer_uptodate(bh); 130 set_buffer_uptodate(bh);
127 unlock_buffer(bh);
128 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
132 unlock_buffer(bh);
129 return bh; 133 return bh;
130 } 134 }
131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 135 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
136 if (buffer_uptodate(bh)) {
137 /*
138 * if not uninit if bh is uptodate,
139 * bitmap is also uptodate
140 */
141 set_bitmap_uptodate(bh);
142 unlock_buffer(bh);
143 return bh;
144 }
145 /*
146 * submit the buffer_head for read. We can
147 * safely mark the bitmap as uptodate now.
148 * We do it here so the bitmap uptodate bit
149 * get set with buffer lock held.
150 */
151 set_bitmap_uptodate(bh);
132 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
133 put_bh(bh); 153 put_bh(bh);
134 ext4_error(sb, __func__, 154 ext4_error(sb, __func__,
135 "Cannot read inode bitmap - " 155 "Cannot read inode bitmap - "
136 "block_group = %lu, inode_bitmap = %llu", 156 "block_group = %u, inode_bitmap = %llu",
137 block_group, bitmap_blk); 157 block_group, bitmap_blk);
138 return NULL; 158 return NULL;
139 } 159 }
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
168 struct ext4_group_desc *gdp; 188 struct ext4_group_desc *gdp;
169 struct ext4_super_block *es; 189 struct ext4_super_block *es;
170 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
171 int fatal = 0, err; 191 int fatal = 0, err, count;
172 ext4_group_t flex_group; 192 ext4_group_t flex_group;
173 193
174 if (atomic_read(&inode->i_count) > 1) { 194 if (atomic_read(&inode->i_count) > 1) {
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
190 210
191 ino = inode->i_ino; 211 ino = inode->i_ino;
192 ext4_debug("freeing inode %lu\n", ino); 212 ext4_debug("freeing inode %lu\n", ino);
213 trace_mark(ext4_free_inode,
214 "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
215 sb->s_id, inode->i_ino, inode->i_mode,
216 (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
217 (unsigned long long) inode->i_blocks);
193 218
194 /* 219 /*
195 * Note: we must free any quota before locking the superblock, 220 * Note: we must free any quota before locking the superblock,
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
236 261
237 if (gdp) { 262 if (gdp) {
238 spin_lock(sb_bgl_lock(sbi, block_group)); 263 spin_lock(sb_bgl_lock(sbi, block_group));
239 le16_add_cpu(&gdp->bg_free_inodes_count, 1); 264 count = ext4_free_inodes_count(sb, gdp) + 1;
240 if (is_directory) 265 ext4_free_inodes_set(sb, gdp, count);
241 le16_add_cpu(&gdp->bg_used_dirs_count, -1); 266 if (is_directory) {
267 count = ext4_used_dirs_count(sb, gdp) - 1;
268 ext4_used_dirs_set(sb, gdp, count);
269 }
242 gdp->bg_checksum = ext4_group_desc_csum(sbi, 270 gdp->bg_checksum = ext4_group_desc_csum(sbi,
243 block_group, gdp); 271 block_group, gdp);
244 spin_unlock(sb_bgl_lock(sbi, block_group)); 272 spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
253 spin_unlock(sb_bgl_lock(sbi, flex_group)); 281 spin_unlock(sb_bgl_lock(sbi, flex_group));
254 } 282 }
255 } 283 }
256 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
257 err = ext4_journal_dirty_metadata(handle, bh2); 285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
258 if (!fatal) fatal = err; 286 if (!fatal) fatal = err;
259 } 287 }
260 BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); 288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
261 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
262 if (!fatal) 290 if (!fatal)
263 fatal = err; 291 fatal = err;
264 sb->s_dirt = 1; 292 sb->s_dirt = 1;
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
291 319
292 for (group = 0; group < ngroups; group++) { 320 for (group = 0; group < ngroups; group++) {
293 desc = ext4_get_group_desc(sb, group, NULL); 321 desc = ext4_get_group_desc(sb, group, NULL);
294 if (!desc || !desc->bg_free_inodes_count) 322 if (!desc || !ext4_free_inodes_count(sb, desc))
295 continue; 323 continue;
296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 324 if (ext4_free_inodes_count(sb, desc) < avefreei)
297 continue; 325 continue;
298 if (!best_desc || 326 if (!best_desc ||
299 (le16_to_cpu(desc->bg_free_blocks_count) > 327 (ext4_free_blks_count(sb, desc) >
300 le16_to_cpu(best_desc->bg_free_blocks_count))) { 328 ext4_free_blks_count(sb, best_desc))) {
301 *best_group = group; 329 *best_group = group;
302 best_desc = desc; 330 best_desc = desc;
303 ret = 0; 331 ret = 0;
@@ -369,7 +397,7 @@ found_flexbg:
369 for (i = best_flex * flex_size; i < ngroups && 397 for (i = best_flex * flex_size; i < ngroups &&
370 i < (best_flex + 1) * flex_size; i++) { 398 i < (best_flex + 1) * flex_size; i++) {
371 desc = ext4_get_group_desc(sb, i, &bh); 399 desc = ext4_get_group_desc(sb, i, &bh);
372 if (le16_to_cpu(desc->bg_free_inodes_count)) { 400 if (ext4_free_inodes_count(sb, desc)) {
373 *best_group = i; 401 *best_group = i;
374 goto out; 402 goto out;
375 } 403 }
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
443 for (i = 0; i < ngroups; i++) { 471 for (i = 0; i < ngroups; i++) {
444 grp = (parent_group + i) % ngroups; 472 grp = (parent_group + i) % ngroups;
445 desc = ext4_get_group_desc(sb, grp, NULL); 473 desc = ext4_get_group_desc(sb, grp, NULL);
446 if (!desc || !desc->bg_free_inodes_count) 474 if (!desc || !ext4_free_inodes_count(sb, desc))
447 continue; 475 continue;
448 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) 476 if (ext4_used_dirs_count(sb, desc) >= best_ndir)
449 continue; 477 continue;
450 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 478 if (ext4_free_inodes_count(sb, desc) < avefreei)
451 continue; 479 continue;
452 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) 480 if (ext4_free_blks_count(sb, desc) < avefreeb)
453 continue; 481 continue;
454 *group = grp; 482 *group = grp;
455 ret = 0; 483 ret = 0;
456 best_ndir = le16_to_cpu(desc->bg_used_dirs_count); 484 best_ndir = ext4_used_dirs_count(sb, desc);
457 } 485 }
458 if (ret == 0) 486 if (ret == 0)
459 return ret; 487 return ret;
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
479 for (i = 0; i < ngroups; i++) { 507 for (i = 0; i < ngroups; i++) {
480 *group = (parent_group + i) % ngroups; 508 *group = (parent_group + i) % ngroups;
481 desc = ext4_get_group_desc(sb, *group, NULL); 509 desc = ext4_get_group_desc(sb, *group, NULL);
482 if (!desc || !desc->bg_free_inodes_count) 510 if (!desc || !ext4_free_inodes_count(sb, desc))
483 continue; 511 continue;
484 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) 512 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
485 continue; 513 continue;
486 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) 514 if (ext4_free_inodes_count(sb, desc) < min_inodes)
487 continue; 515 continue;
488 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) 516 if (ext4_free_blks_count(sb, desc) < min_blocks)
489 continue; 517 continue;
490 return 0; 518 return 0;
491 } 519 }
@@ -494,8 +522,8 @@ fallback:
494 for (i = 0; i < ngroups; i++) { 522 for (i = 0; i < ngroups; i++) {
495 *group = (parent_group + i) % ngroups; 523 *group = (parent_group + i) % ngroups;
496 desc = ext4_get_group_desc(sb, *group, NULL); 524 desc = ext4_get_group_desc(sb, *group, NULL);
497 if (desc && desc->bg_free_inodes_count && 525 if (desc && ext4_free_inodes_count(sb, desc) &&
498 le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) 526 ext4_free_inodes_count(sb, desc) >= avefreei)
499 return 0; 527 return 0;
500 } 528 }
501 529
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
524 */ 552 */
525 *group = parent_group; 553 *group = parent_group;
526 desc = ext4_get_group_desc(sb, *group, NULL); 554 desc = ext4_get_group_desc(sb, *group, NULL);
527 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 555 if (desc && ext4_free_inodes_count(sb, desc) &&
528 le16_to_cpu(desc->bg_free_blocks_count)) 556 ext4_free_blks_count(sb, desc))
529 return 0; 557 return 0;
530 558
531 /* 559 /*
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
548 if (*group >= ngroups) 576 if (*group >= ngroups)
549 *group -= ngroups; 577 *group -= ngroups;
550 desc = ext4_get_group_desc(sb, *group, NULL); 578 desc = ext4_get_group_desc(sb, *group, NULL);
551 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 579 if (desc && ext4_free_inodes_count(sb, desc) &&
552 le16_to_cpu(desc->bg_free_blocks_count)) 580 ext4_free_blks_count(sb, desc))
553 return 0; 581 return 0;
554 } 582 }
555 583
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
562 if (++*group >= ngroups) 590 if (++*group >= ngroups)
563 *group = 0; 591 *group = 0;
564 desc = ext4_get_group_desc(sb, *group, NULL); 592 desc = ext4_get_group_desc(sb, *group, NULL);
565 if (desc && le16_to_cpu(desc->bg_free_inodes_count)) 593 if (desc && ext4_free_inodes_count(sb, desc))
566 return 0; 594 return 0;
567 } 595 }
568 596
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
570} 598}
571 599
572/* 600/*
601 * claim the inode from the inode bitmap. If the group
602 * is uninit we need to take the groups's sb_bgl_lock
603 * and clear the uninit flag. The inode bitmap update
604 * and group desc uninit flag clear should be done
605 * after holding sb_bgl_lock so that ext4_read_inode_bitmap
606 * doesn't race with the ext4_claim_inode
607 */
608static int ext4_claim_inode(struct super_block *sb,
609 struct buffer_head *inode_bitmap_bh,
610 unsigned long ino, ext4_group_t group, int mode)
611{
612 int free = 0, retval = 0, count;
613 struct ext4_sb_info *sbi = EXT4_SB(sb);
614 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
615
616 spin_lock(sb_bgl_lock(sbi, group));
617 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
618 /* not a free inode */
619 retval = 1;
620 goto err_ret;
621 }
622 ino++;
623 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
624 ino > EXT4_INODES_PER_GROUP(sb)) {
625 spin_unlock(sb_bgl_lock(sbi, group));
626 ext4_error(sb, __func__,
627 "reserved inode or inode > inodes count - "
628 "block_group = %u, inode=%lu", group,
629 ino + group * EXT4_INODES_PER_GROUP(sb));
630 return 1;
631 }
632 /* If we didn't allocate from within the initialized part of the inode
633 * table then we need to initialize up to this inode. */
634 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
635
636 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
637 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
638 /* When marking the block group with
639 * ~EXT4_BG_INODE_UNINIT we don't want to depend
640 * on the value of bg_itable_unused even though
641 * mke2fs could have initialized the same for us.
642 * Instead we calculated the value below
643 */
644
645 free = 0;
646 } else {
647 free = EXT4_INODES_PER_GROUP(sb) -
648 ext4_itable_unused_count(sb, gdp);
649 }
650
651 /*
652 * Check the relative inode number against the last used
653 * relative inode number in this group. if it is greater
654 * we need to update the bg_itable_unused count
655 *
656 */
657 if (ino > free)
658 ext4_itable_unused_set(sb, gdp,
659 (EXT4_INODES_PER_GROUP(sb) - ino));
660 }
661 count = ext4_free_inodes_count(sb, gdp) - 1;
662 ext4_free_inodes_set(sb, gdp, count);
663 if (S_ISDIR(mode)) {
664 count = ext4_used_dirs_count(sb, gdp) + 1;
665 ext4_used_dirs_set(sb, gdp, count);
666 }
667 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
668err_ret:
669 spin_unlock(sb_bgl_lock(sbi, group));
670 return retval;
671}
672
673/*
573 * There are two policies for allocating an inode. If the new inode is 674 * There are two policies for allocating an inode. If the new inode is
574 * a directory, then a forward search is made for a block group with both 675 * a directory, then a forward search is made for a block group with both
575 * free space and a low directory-to-inode ratio; if that fails, then of 676 * free space and a low directory-to-inode ratio; if that fails, then of
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) 683struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
583{ 684{
584 struct super_block *sb; 685 struct super_block *sb;
585 struct buffer_head *bitmap_bh = NULL; 686 struct buffer_head *inode_bitmap_bh = NULL;
586 struct buffer_head *bh2; 687 struct buffer_head *group_desc_bh;
587 ext4_group_t group = 0; 688 ext4_group_t group = 0;
588 unsigned long ino = 0; 689 unsigned long ino = 0;
589 struct inode *inode; 690 struct inode *inode;
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
602 return ERR_PTR(-EPERM); 703 return ERR_PTR(-EPERM);
603 704
604 sb = dir->i_sb; 705 sb = dir->i_sb;
706 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
707 dir->i_ino, mode);
605 inode = new_inode(sb); 708 inode = new_inode(sb);
606 if (!inode) 709 if (!inode)
607 return ERR_PTR(-ENOMEM); 710 return ERR_PTR(-ENOMEM);
@@ -631,40 +734,52 @@ got_group:
631 for (i = 0; i < sbi->s_groups_count; i++) { 734 for (i = 0; i < sbi->s_groups_count; i++) {
632 err = -EIO; 735 err = -EIO;
633 736
634 gdp = ext4_get_group_desc(sb, group, &bh2); 737 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
635 if (!gdp) 738 if (!gdp)
636 goto fail; 739 goto fail;
637 740
638 brelse(bitmap_bh); 741 brelse(inode_bitmap_bh);
639 bitmap_bh = ext4_read_inode_bitmap(sb, group); 742 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
640 if (!bitmap_bh) 743 if (!inode_bitmap_bh)
641 goto fail; 744 goto fail;
642 745
643 ino = 0; 746 ino = 0;
644 747
645repeat_in_this_group: 748repeat_in_this_group:
646 ino = ext4_find_next_zero_bit((unsigned long *) 749 ino = ext4_find_next_zero_bit((unsigned long *)
647 bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); 750 inode_bitmap_bh->b_data,
751 EXT4_INODES_PER_GROUP(sb), ino);
752
648 if (ino < EXT4_INODES_PER_GROUP(sb)) { 753 if (ino < EXT4_INODES_PER_GROUP(sb)) {
649 754
650 BUFFER_TRACE(bitmap_bh, "get_write_access"); 755 BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
651 err = ext4_journal_get_write_access(handle, bitmap_bh); 756 err = ext4_journal_get_write_access(handle,
757 inode_bitmap_bh);
652 if (err) 758 if (err)
653 goto fail; 759 goto fail;
654 760
655 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), 761 BUFFER_TRACE(group_desc_bh, "get_write_access");
656 ino, bitmap_bh->b_data)) { 762 err = ext4_journal_get_write_access(handle,
763 group_desc_bh);
764 if (err)
765 goto fail;
766 if (!ext4_claim_inode(sb, inode_bitmap_bh,
767 ino, group, mode)) {
657 /* we won it */ 768 /* we won it */
658 BUFFER_TRACE(bitmap_bh, 769 BUFFER_TRACE(inode_bitmap_bh,
659 "call ext4_journal_dirty_metadata"); 770 "call ext4_handle_dirty_metadata");
660 err = ext4_journal_dirty_metadata(handle, 771 err = ext4_handle_dirty_metadata(handle,
661 bitmap_bh); 772 inode,
773 inode_bitmap_bh);
662 if (err) 774 if (err)
663 goto fail; 775 goto fail;
776 /* zero bit is inode number 1*/
777 ino++;
664 goto got; 778 goto got;
665 } 779 }
666 /* we lost it */ 780 /* we lost it */
667 jbd2_journal_release_buffer(handle, bitmap_bh); 781 ext4_handle_release_buffer(handle, inode_bitmap_bh);
782 ext4_handle_release_buffer(handle, group_desc_bh);
668 783
669 if (++ino < EXT4_INODES_PER_GROUP(sb)) 784 if (++ino < EXT4_INODES_PER_GROUP(sb))
670 goto repeat_in_this_group; 785 goto repeat_in_this_group;
@@ -684,30 +799,16 @@ repeat_in_this_group:
684 goto out; 799 goto out;
685 800
686got: 801got:
687 ino++;
688 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
689 ino > EXT4_INODES_PER_GROUP(sb)) {
690 ext4_error(sb, __func__,
691 "reserved inode or inode > inodes count - "
692 "block_group = %lu, inode=%lu", group,
693 ino + group * EXT4_INODES_PER_GROUP(sb));
694 err = -EIO;
695 goto fail;
696 }
697
698 BUFFER_TRACE(bh2, "get_write_access");
699 err = ext4_journal_get_write_access(handle, bh2);
700 if (err) goto fail;
701
702 /* We may have to initialize the block bitmap if it isn't already */ 802 /* We may have to initialize the block bitmap if it isn't already */
703 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 803 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
704 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 804 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
705 struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); 805 struct buffer_head *block_bitmap_bh;
706 806
707 BUFFER_TRACE(block_bh, "get block bitmap access"); 807 block_bitmap_bh = ext4_read_block_bitmap(sb, group);
708 err = ext4_journal_get_write_access(handle, block_bh); 808 BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
809 err = ext4_journal_get_write_access(handle, block_bitmap_bh);
709 if (err) { 810 if (err) {
710 brelse(block_bh); 811 brelse(block_bitmap_bh);
711 goto fail; 812 goto fail;
712 } 813 }
713 814
@@ -715,9 +816,9 @@ got:
715 spin_lock(sb_bgl_lock(sbi, group)); 816 spin_lock(sb_bgl_lock(sbi, group));
716 /* recheck and clear flag under lock if we still need to */ 817 /* recheck and clear flag under lock if we still need to */
717 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 818 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
718 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
719 free = ext4_free_blocks_after_init(sb, group, gdp); 819 free = ext4_free_blocks_after_init(sb, group, gdp);
720 gdp->bg_free_blocks_count = cpu_to_le16(free); 820 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
821 ext4_free_blks_set(sb, gdp, free);
721 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 822 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
722 gdp); 823 gdp);
723 } 824 }
@@ -725,55 +826,19 @@ got:
725 826
726 /* Don't need to dirty bitmap block if we didn't change it */ 827 /* Don't need to dirty bitmap block if we didn't change it */
727 if (free) { 828 if (free) {
728 BUFFER_TRACE(block_bh, "dirty block bitmap"); 829 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
729 err = ext4_journal_dirty_metadata(handle, block_bh); 830 err = ext4_handle_dirty_metadata(handle,
831 NULL, block_bitmap_bh);
730 } 832 }
731 833
732 brelse(block_bh); 834 brelse(block_bitmap_bh);
733 if (err) 835 if (err)
734 goto fail; 836 goto fail;
735 } 837 }
736 838 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
737 spin_lock(sb_bgl_lock(sbi, group)); 839 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
738 /* If we didn't allocate from within the initialized part of the inode 840 if (err)
739 * table then we need to initialize up to this inode. */ 841 goto fail;
740 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
741 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
742 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
743
744 /* When marking the block group with
745 * ~EXT4_BG_INODE_UNINIT we don't want to depend
746 * on the value of bg_itable_unused even though
747 * mke2fs could have initialized the same for us.
748 * Instead we calculated the value below
749 */
750
751 free = 0;
752 } else {
753 free = EXT4_INODES_PER_GROUP(sb) -
754 le16_to_cpu(gdp->bg_itable_unused);
755 }
756
757 /*
758 * Check the relative inode number against the last used
759 * relative inode number in this group. if it is greater
760 * we need to update the bg_itable_unused count
761 *
762 */
763 if (ino > free)
764 gdp->bg_itable_unused =
765 cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
766 }
767
768 le16_add_cpu(&gdp->bg_free_inodes_count, -1);
769 if (S_ISDIR(mode)) {
770 le16_add_cpu(&gdp->bg_used_dirs_count, 1);
771 }
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773 spin_unlock(sb_bgl_lock(sbi, group));
774 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
775 err = ext4_journal_dirty_metadata(handle, bh2);
776 if (err) goto fail;
777 842
778 percpu_counter_dec(&sbi->s_freeinodes_counter); 843 percpu_counter_dec(&sbi->s_freeinodes_counter);
779 if (S_ISDIR(mode)) 844 if (S_ISDIR(mode))
@@ -825,8 +890,11 @@ got:
825 890
826 ext4_set_inode_flags(inode); 891 ext4_set_inode_flags(inode);
827 if (IS_DIRSYNC(inode)) 892 if (IS_DIRSYNC(inode))
828 handle->h_sync = 1; 893 ext4_handle_sync(handle);
829 insert_inode_hash(inode); 894 if (insert_inode_locked(inode) < 0) {
895 err = -EINVAL;
896 goto fail_drop;
897 }
830 spin_lock(&sbi->s_next_gen_lock); 898 spin_lock(&sbi->s_next_gen_lock);
831 inode->i_generation = sbi->s_next_generation++; 899 inode->i_generation = sbi->s_next_generation++;
832 spin_unlock(&sbi->s_next_gen_lock); 900 spin_unlock(&sbi->s_next_gen_lock);
@@ -849,7 +917,7 @@ got:
849 if (err) 917 if (err)
850 goto fail_free_drop; 918 goto fail_free_drop;
851 919
852 if (test_opt(sb, EXTENTS)) { 920 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
853 /* set extent flag only for directory, file and normal symlink*/ 921 /* set extent flag only for directory, file and normal symlink*/
854 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 922 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
855 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 923 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
@@ -864,6 +932,8 @@ got:
864 } 932 }
865 933
866 ext4_debug("allocating inode %lu\n", inode->i_ino); 934 ext4_debug("allocating inode %lu\n", inode->i_ino);
935 trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
936 sb->s_id, inode->i_ino, dir->i_ino, mode);
867 goto really_out; 937 goto really_out;
868fail: 938fail:
869 ext4_std_error(sb, err); 939 ext4_std_error(sb, err);
@@ -871,7 +941,7 @@ out:
871 iput(inode); 941 iput(inode);
872 ret = ERR_PTR(err); 942 ret = ERR_PTR(err);
873really_out: 943really_out:
874 brelse(bitmap_bh); 944 brelse(inode_bitmap_bh);
875 return ret; 945 return ret;
876 946
877fail_free_drop: 947fail_free_drop:
@@ -881,8 +951,9 @@ fail_drop:
881 DQUOT_DROP(inode); 951 DQUOT_DROP(inode);
882 inode->i_flags |= S_NOQUOTA; 952 inode->i_flags |= S_NOQUOTA;
883 inode->i_nlink = 0; 953 inode->i_nlink = 0;
954 unlock_new_inode(inode);
884 iput(inode); 955 iput(inode);
885 brelse(bitmap_bh); 956 brelse(inode_bitmap_bh);
886 return ERR_PTR(err); 957 return ERR_PTR(err);
887} 958}
888 959
@@ -981,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
981 gdp = ext4_get_group_desc(sb, i, NULL); 1052 gdp = ext4_get_group_desc(sb, i, NULL);
982 if (!gdp) 1053 if (!gdp)
983 continue; 1054 continue;
984 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1055 desc_count += ext4_free_inodes_count(sb, gdp);
985 brelse(bitmap_bh); 1056 brelse(bitmap_bh);
986 bitmap_bh = ext4_read_inode_bitmap(sb, i); 1057 bitmap_bh = ext4_read_inode_bitmap(sb, i);
987 if (!bitmap_bh) 1058 if (!bitmap_bh)
@@ -989,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
989 1060
990 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1061 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
991 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1062 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
992 i, le16_to_cpu(gdp->bg_free_inodes_count), x); 1063 i, ext4_free_inodes_count(sb, gdp), x);
993 bitmap_count += x; 1064 bitmap_count += x;
994 } 1065 }
995 brelse(bitmap_bh); 1066 brelse(bitmap_bh);
@@ -1003,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1003 gdp = ext4_get_group_desc(sb, i, NULL); 1074 gdp = ext4_get_group_desc(sb, i, NULL);
1004 if (!gdp) 1075 if (!gdp)
1005 continue; 1076 continue;
1006 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1077 desc_count += ext4_free_inodes_count(sb, gdp);
1007 cond_resched(); 1078 cond_resched();
1008 } 1079 }
1009 return desc_count; 1080 return desc_count;
@@ -1020,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1020 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1091 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1021 if (!gdp) 1092 if (!gdp)
1022 continue; 1093 continue;
1023 count += le16_to_cpu(gdp->bg_used_dirs_count); 1094 count += ext4_used_dirs_count(sb, gdp);
1024 } 1095 }
1025 return count; 1096 return count;
1026} 1097}
1027
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be21a5ae33cb..03ba20be1329 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -34,6 +34,7 @@
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <linux/pagevec.h> 35#include <linux/pagevec.h>
36#include <linux/mpage.h> 36#include <linux/mpage.h>
37#include <linux/namei.h>
37#include <linux/uio.h> 38#include <linux/uio.h>
38#include <linux/bio.h> 39#include <linux/bio.h>
39#include "ext4_jbd2.h" 40#include "ext4_jbd2.h"
@@ -71,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
71 * "bh" may be NULL: a metadata block may have been freed from memory 72 * "bh" may be NULL: a metadata block may have been freed from memory
72 * but there may still be a record of it in the journal, and that record 73 * but there may still be a record of it in the journal, and that record
73 * still needs to be revoked. 74 * still needs to be revoked.
75 *
76 * If the handle isn't valid we're not journaling so there's nothing to do.
74 */ 77 */
75int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 78int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
76 struct buffer_head *bh, ext4_fsblk_t blocknr) 79 struct buffer_head *bh, ext4_fsblk_t blocknr)
77{ 80{
78 int err; 81 int err;
79 82
83 if (!ext4_handle_valid(handle))
84 return 0;
85
80 might_sleep(); 86 might_sleep();
81 87
82 BUFFER_TRACE(bh, "enter"); 88 BUFFER_TRACE(bh, "enter");
@@ -169,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
169 */ 175 */
170static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 176static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
171{ 177{
172 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 178 if (!ext4_handle_valid(handle))
179 return 0;
180 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
173 return 0; 181 return 0;
174 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 182 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
175 return 0; 183 return 0;
@@ -183,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
183 */ 191 */
184static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 192static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
185{ 193{
194 BUG_ON(EXT4_JOURNAL(inode) == NULL);
186 jbd_debug(2, "restarting handle %p\n", handle); 195 jbd_debug(2, "restarting handle %p\n", handle);
187 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 196 return ext4_journal_restart(handle, blocks_for_truncate(inode));
188} 197}
@@ -215,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
215 } 224 }
216 225
217 if (IS_SYNC(inode)) 226 if (IS_SYNC(inode))
218 handle->h_sync = 1; 227 ext4_handle_sync(handle);
219 inode->i_size = 0; 228 inode->i_size = 0;
220 err = ext4_mark_inode_dirty(handle, inode); 229 err = ext4_mark_inode_dirty(handle, inode);
221 if (err) { 230 if (err) {
@@ -232,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
232 * enough credits left in the handle to remove the inode from 241 * enough credits left in the handle to remove the inode from
233 * the orphan list and set the dtime field. 242 * the orphan list and set the dtime field.
234 */ 243 */
235 if (handle->h_buffer_credits < 3) { 244 if (!ext4_handle_has_enough_credits(handle, 3)) {
236 err = ext4_journal_extend(handle, 3); 245 err = ext4_journal_extend(handle, 3);
237 if (err > 0) 246 if (err > 0)
238 err = ext4_journal_restart(handle, 3); 247 err = ext4_journal_restart(handle, 3);
@@ -351,9 +360,9 @@ static int ext4_block_to_path(struct inode *inode,
351 final = ptrs; 360 final = ptrs;
352 } else { 361 } else {
353 ext4_warning(inode->i_sb, "ext4_block_to_path", 362 ext4_warning(inode->i_sb, "ext4_block_to_path",
354 "block %lu > max", 363 "block %lu > max in inode %lu",
355 i_block + direct_blocks + 364 i_block + direct_blocks +
356 indirect_blocks + double_blocks); 365 indirect_blocks + double_blocks, inode->i_ino);
357 } 366 }
358 if (boundary) 367 if (boundary)
359 *boundary = final - 1 - (i_block & (ptrs - 1)); 368 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -505,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
505 * return the total number of blocks to be allocate, including the 514 * return the total number of blocks to be allocate, including the
506 * direct and indirect blocks. 515 * direct and indirect blocks.
507 */ 516 */
508static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, 517static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
509 int blocks_to_boundary) 518 int blocks_to_boundary)
510{ 519{
511 unsigned long count = 0; 520 unsigned int count = 0;
512 521
513 /* 522 /*
514 * Simple case, [t,d]Indirect block(s) has not allocated yet 523 * Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -546,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
546 int indirect_blks, int blks, 555 int indirect_blks, int blks,
547 ext4_fsblk_t new_blocks[4], int *err) 556 ext4_fsblk_t new_blocks[4], int *err)
548{ 557{
558 struct ext4_allocation_request ar;
549 int target, i; 559 int target, i;
550 unsigned long count = 0, blk_allocated = 0; 560 unsigned long count = 0, blk_allocated = 0;
551 int index = 0; 561 int index = 0;
@@ -594,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
594 if (!target) 604 if (!target)
595 goto allocated; 605 goto allocated;
596 /* Now allocate data blocks */ 606 /* Now allocate data blocks */
597 count = target; 607 memset(&ar, 0, sizeof(ar));
598 /* allocating blocks for data blocks */ 608 ar.inode = inode;
599 current_block = ext4_new_blocks(handle, inode, iblock, 609 ar.goal = goal;
600 goal, &count, err); 610 ar.len = target;
611 ar.logical = iblock;
612 if (S_ISREG(inode->i_mode))
613 /* enable in-core preallocation only for regular files */
614 ar.flags = EXT4_MB_HINT_DATA;
615
616 current_block = ext4_mb_new_blocks(handle, &ar, err);
617
601 if (*err && (target == blks)) { 618 if (*err && (target == blks)) {
602 /* 619 /*
603 * if the allocation failed and we didn't allocate 620 * if the allocation failed and we didn't allocate
@@ -613,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
613 */ 630 */
614 new_blocks[index] = current_block; 631 new_blocks[index] = current_block;
615 } 632 }
616 blk_allocated += count; 633 blk_allocated += ar.len;
617 } 634 }
618allocated: 635allocated:
619 /* total number of blocks allocated for direct blocks */ 636 /* total number of blocks allocated for direct blocks */
@@ -708,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
708 set_buffer_uptodate(bh); 725 set_buffer_uptodate(bh);
709 unlock_buffer(bh); 726 unlock_buffer(bh);
710 727
711 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 728 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
712 err = ext4_journal_dirty_metadata(handle, bh); 729 err = ext4_handle_dirty_metadata(handle, inode, bh);
713 if (err) 730 if (err)
714 goto failed; 731 goto failed;
715 } 732 }
@@ -791,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
791 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 808 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
792 */ 809 */
793 jbd_debug(5, "splicing indirect only\n"); 810 jbd_debug(5, "splicing indirect only\n");
794 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); 811 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
795 err = ext4_journal_dirty_metadata(handle, where->bh); 812 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
796 if (err) 813 if (err)
797 goto err_out; 814 goto err_out;
798 } else { 815 } else {
@@ -839,10 +856,10 @@ err_out:
839 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 856 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
840 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 857 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
841 */ 858 */
842int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 859static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
843 ext4_lblk_t iblock, unsigned long maxblocks, 860 ext4_lblk_t iblock, unsigned int maxblocks,
844 struct buffer_head *bh_result, 861 struct buffer_head *bh_result,
845 int create, int extend_disksize) 862 int create, int extend_disksize)
846{ 863{
847 int err = -EIO; 864 int err = -EIO;
848 ext4_lblk_t offsets[4]; 865 ext4_lblk_t offsets[4];
@@ -1044,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1044 * It returns the error in case of allocation failure. 1061 * It returns the error in case of allocation failure.
1045 */ 1062 */
1046int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 1063int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1047 unsigned long max_blocks, struct buffer_head *bh, 1064 unsigned int max_blocks, struct buffer_head *bh,
1048 int create, int extend_disksize, int flag) 1065 int create, int extend_disksize, int flag)
1049{ 1066{
1050 int retval; 1067 int retval;
@@ -1220,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1220 set_buffer_uptodate(bh); 1237 set_buffer_uptodate(bh);
1221 } 1238 }
1222 unlock_buffer(bh); 1239 unlock_buffer(bh);
1223 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1240 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1224 err = ext4_journal_dirty_metadata(handle, bh); 1241 err = ext4_handle_dirty_metadata(handle, inode, bh);
1225 if (!fatal) 1242 if (!fatal)
1226 fatal = err; 1243 fatal = err;
1227 } else { 1244 } else {
@@ -1334,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1334 pgoff_t index; 1351 pgoff_t index;
1335 unsigned from, to; 1352 unsigned from, to;
1336 1353
1354 trace_mark(ext4_write_begin,
1355 "dev %s ino %lu pos %llu len %u flags %u",
1356 inode->i_sb->s_id, inode->i_ino,
1357 (unsigned long long) pos, len, flags);
1337 index = pos >> PAGE_CACHE_SHIFT; 1358 index = pos >> PAGE_CACHE_SHIFT;
1338 from = pos & (PAGE_CACHE_SIZE - 1); 1359 from = pos & (PAGE_CACHE_SIZE - 1);
1339 to = from + len; 1360 to = from + len;
@@ -1345,7 +1366,7 @@ retry:
1345 goto out; 1366 goto out;
1346 } 1367 }
1347 1368
1348 page = __grab_cache_page(mapping, index); 1369 page = grab_cache_page_write_begin(mapping, index, flags);
1349 if (!page) { 1370 if (!page) {
1350 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1351 ret = -ENOMEM; 1372 ret = -ENOMEM;
@@ -1386,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1386 if (!buffer_mapped(bh) || buffer_freed(bh)) 1407 if (!buffer_mapped(bh) || buffer_freed(bh))
1387 return 0; 1408 return 0;
1388 set_buffer_uptodate(bh); 1409 set_buffer_uptodate(bh);
1389 return ext4_journal_dirty_metadata(handle, bh); 1410 return ext4_handle_dirty_metadata(handle, NULL, bh);
1390} 1411}
1391 1412
1392/* 1413/*
@@ -1405,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
1405 struct inode *inode = mapping->host; 1426 struct inode *inode = mapping->host;
1406 int ret = 0, ret2; 1427 int ret = 0, ret2;
1407 1428
1429 trace_mark(ext4_ordered_write_end,
1430 "dev %s ino %lu pos %llu len %u copied %u",
1431 inode->i_sb->s_id, inode->i_ino,
1432 (unsigned long long) pos, len, copied);
1408 ret = ext4_jbd2_file_inode(handle, inode); 1433 ret = ext4_jbd2_file_inode(handle, inode);
1409 1434
1410 if (ret == 0) { 1435 if (ret == 0) {
@@ -1443,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
1443 int ret = 0, ret2; 1468 int ret = 0, ret2;
1444 loff_t new_i_size; 1469 loff_t new_i_size;
1445 1470
1471 trace_mark(ext4_writeback_write_end,
1472 "dev %s ino %lu pos %llu len %u copied %u",
1473 inode->i_sb->s_id, inode->i_ino,
1474 (unsigned long long) pos, len, copied);
1446 new_i_size = pos + copied; 1475 new_i_size = pos + copied;
1447 if (new_i_size > EXT4_I(inode)->i_disksize) { 1476 if (new_i_size > EXT4_I(inode)->i_disksize) {
1448 ext4_update_i_disksize(inode, new_i_size); 1477 ext4_update_i_disksize(inode, new_i_size);
@@ -1478,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
1478 unsigned from, to; 1507 unsigned from, to;
1479 loff_t new_i_size; 1508 loff_t new_i_size;
1480 1509
1510 trace_mark(ext4_journalled_write_end,
1511 "dev %s ino %lu pos %llu len %u copied %u",
1512 inode->i_sb->s_id, inode->i_ino,
1513 (unsigned long long) pos, len, copied);
1481 from = pos & (PAGE_CACHE_SIZE - 1); 1514 from = pos & (PAGE_CACHE_SIZE - 1);
1482 to = from + len; 1515 to = from + len;
1483 1516
@@ -1624,7 +1657,7 @@ struct mpage_da_data {
1624 get_block_t *get_block; 1657 get_block_t *get_block;
1625 struct writeback_control *wbc; 1658 struct writeback_control *wbc;
1626 int io_done; 1659 int io_done;
1627 long pages_written; 1660 int pages_written;
1628 int retval; 1661 int retval;
1629}; 1662};
1630 1663
@@ -1644,35 +1677,39 @@ struct mpage_da_data {
1644 */ 1677 */
1645static int mpage_da_submit_io(struct mpage_da_data *mpd) 1678static int mpage_da_submit_io(struct mpage_da_data *mpd)
1646{ 1679{
1647 struct address_space *mapping = mpd->inode->i_mapping;
1648 int ret = 0, err, nr_pages, i;
1649 unsigned long index, end;
1650 struct pagevec pvec;
1651 long pages_skipped; 1680 long pages_skipped;
1681 struct pagevec pvec;
1682 unsigned long index, end;
1683 int ret = 0, err, nr_pages, i;
1684 struct inode *inode = mpd->inode;
1685 struct address_space *mapping = inode->i_mapping;
1652 1686
1653 BUG_ON(mpd->next_page <= mpd->first_page); 1687 BUG_ON(mpd->next_page <= mpd->first_page);
1654 pagevec_init(&pvec, 0); 1688 /*
1689 * We need to start from the first_page to the next_page - 1
1690 * to make sure we also write the mapped dirty buffer_heads.
1691 * If we look at mpd->lbh.b_blocknr we would only be looking
1692 * at the currently mapped buffer_heads.
1693 */
1655 index = mpd->first_page; 1694 index = mpd->first_page;
1656 end = mpd->next_page - 1; 1695 end = mpd->next_page - 1;
1657 1696
1697 pagevec_init(&pvec, 0);
1658 while (index <= end) { 1698 while (index <= end) {
1659 /* 1699 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1660 * We can use PAGECACHE_TAG_DIRTY lookup here because
1661 * even though we have cleared the dirty flag on the page
1662 * We still keep the page in the radix tree with tag
1663 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
1664 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
1665 * which is called via the below writepage callback.
1666 */
1667 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1668 PAGECACHE_TAG_DIRTY,
1669 min(end - index,
1670 (pgoff_t)PAGEVEC_SIZE-1) + 1);
1671 if (nr_pages == 0) 1700 if (nr_pages == 0)
1672 break; 1701 break;
1673 for (i = 0; i < nr_pages; i++) { 1702 for (i = 0; i < nr_pages; i++) {
1674 struct page *page = pvec.pages[i]; 1703 struct page *page = pvec.pages[i];
1675 1704
1705 index = page->index;
1706 if (index > end)
1707 break;
1708 index++;
1709
1710 BUG_ON(!PageLocked(page));
1711 BUG_ON(PageWriteback(page));
1712
1676 pages_skipped = mpd->wbc->pages_skipped; 1713 pages_skipped = mpd->wbc->pages_skipped;
1677 err = mapping->a_ops->writepage(page, mpd->wbc); 1714 err = mapping->a_ops->writepage(page, mpd->wbc);
1678 if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1715 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -1830,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode)
1830 ext4_count_free_blocks(inode->i_sb)); 1867 ext4_count_free_blocks(inode->i_sb));
1831 printk(KERN_EMERG "Free/Dirty block details\n"); 1868 printk(KERN_EMERG "Free/Dirty block details\n");
1832 printk(KERN_EMERG "free_blocks=%lld\n", 1869 printk(KERN_EMERG "free_blocks=%lld\n",
1833 percpu_counter_sum(&sbi->s_freeblocks_counter)); 1870 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
1834 printk(KERN_EMERG "dirty_blocks=%lld\n", 1871 printk(KERN_EMERG "dirty_blocks=%lld\n",
1835 percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1872 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1836 printk(KERN_EMERG "Block reservation details\n"); 1873 printk(KERN_EMERG "Block reservation details\n");
1837 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", 1874 printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
1838 EXT4_I(inode)->i_reserved_data_blocks); 1875 EXT4_I(inode)->i_reserved_data_blocks);
1839 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", 1876 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
1840 EXT4_I(inode)->i_reserved_meta_blocks); 1877 EXT4_I(inode)->i_reserved_meta_blocks);
1841 return; 1878 return;
1842} 1879}
@@ -2086,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page,
2086 bh = head; 2123 bh = head;
2087 do { 2124 do {
2088 BUG_ON(buffer_locked(bh)); 2125 BUG_ON(buffer_locked(bh));
2126 /*
2127 * We need to try to allocate
2128 * unmapped blocks in the same page.
2129 * Otherwise we won't make progress
2130 * with the page in ext4_da_writepage
2131 */
2089 if (buffer_dirty(bh) && 2132 if (buffer_dirty(bh) &&
2090 (!buffer_mapped(bh) || buffer_delay(bh))) { 2133 (!buffer_mapped(bh) || buffer_delay(bh))) {
2091 mpage_add_bh_to_extent(mpd, logical, bh); 2134 mpage_add_bh_to_extent(mpd, logical, bh);
2092 if (mpd->io_done) 2135 if (mpd->io_done)
2093 return MPAGE_DA_EXTENT_TAIL; 2136 return MPAGE_DA_EXTENT_TAIL;
2137 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2138 /*
2139 * mapped dirty buffer. We need to update
2140 * the b_state because we look at
2141 * b_state in mpage_da_map_blocks. We don't
2142 * update b_size because if we find an
2143 * unmapped buffer_head later we need to
2144 * use the b_state flag of that buffer_head.
2145 */
2146 if (mpd->lbh.b_size == 0)
2147 mpd->lbh.b_state =
2148 bh->b_state & BH_FLAGS;
2094 } 2149 }
2095 logical++; 2150 logical++;
2096 } while ((bh = bh->b_this_page) != head); 2151 } while ((bh = bh->b_this_page) != head);
@@ -2268,10 +2323,13 @@ static int ext4_da_writepage(struct page *page,
2268{ 2323{
2269 int ret = 0; 2324 int ret = 0;
2270 loff_t size; 2325 loff_t size;
2271 unsigned long len; 2326 unsigned int len;
2272 struct buffer_head *page_bufs; 2327 struct buffer_head *page_bufs;
2273 struct inode *inode = page->mapping->host; 2328 struct inode *inode = page->mapping->host;
2274 2329
2330 trace_mark(ext4_da_writepage,
2331 "dev %s ino %lu page_index %lu",
2332 inode->i_sb->s_id, inode->i_ino, page->index);
2275 size = i_size_read(inode); 2333 size = i_size_read(inode);
2276 if (page->index == size >> PAGE_CACHE_SHIFT) 2334 if (page->index == size >> PAGE_CACHE_SHIFT)
2277 len = size & ~PAGE_CACHE_MASK; 2335 len = size & ~PAGE_CACHE_MASK;
@@ -2377,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping,
2377 struct mpage_da_data mpd; 2435 struct mpage_da_data mpd;
2378 struct inode *inode = mapping->host; 2436 struct inode *inode = mapping->host;
2379 int no_nrwrite_index_update; 2437 int no_nrwrite_index_update;
2380 long pages_written = 0, pages_skipped; 2438 int pages_written = 0;
2439 long pages_skipped;
2381 int needed_blocks, ret = 0, nr_to_writebump = 0; 2440 int needed_blocks, ret = 0, nr_to_writebump = 0;
2382 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2441 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2383 2442
2443 trace_mark(ext4_da_writepages,
2444 "dev %s ino %lu nr_t_write %ld "
2445 "pages_skipped %ld range_start %llu "
2446 "range_end %llu nonblocking %d "
2447 "for_kupdate %d for_reclaim %d "
2448 "for_writepages %d range_cyclic %d",
2449 inode->i_sb->s_id, inode->i_ino,
2450 wbc->nr_to_write, wbc->pages_skipped,
2451 (unsigned long long) wbc->range_start,
2452 (unsigned long long) wbc->range_end,
2453 wbc->nonblocking, wbc->for_kupdate,
2454 wbc->for_reclaim, wbc->for_writepages,
2455 wbc->range_cyclic);
2456
2384 /* 2457 /*
2385 * No pages to write? This is mainly a kludge to avoid starting 2458 * No pages to write? This is mainly a kludge to avoid starting
2386 * a transaction for special inodes like journal inode on last iput() 2459 * a transaction for special inodes like journal inode on last iput()
@@ -2388,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping,
2388 */ 2461 */
2389 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2462 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2390 return 0; 2463 return 0;
2464
2465 /*
2466 * If the filesystem has aborted, it is read-only, so return
2467 * right away instead of dumping stack traces later on that
2468 * will obscure the real source of the problem. We test
2469 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
2470 * the latter could be true if the filesystem is mounted
2471 * read-only, and in that case, ext4_da_writepages should
2472 * *never* be called, so if that ever happens, we would want
2473 * the stack trace.
2474 */
2475 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
2476 return -EROFS;
2477
2391 /* 2478 /*
2392 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2479 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2393 * This make sure small files blocks are allocated in 2480 * This make sure small files blocks are allocated in
@@ -2432,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2432 handle = ext4_journal_start(inode, needed_blocks); 2519 handle = ext4_journal_start(inode, needed_blocks);
2433 if (IS_ERR(handle)) { 2520 if (IS_ERR(handle)) {
2434 ret = PTR_ERR(handle); 2521 ret = PTR_ERR(handle);
2435 printk(KERN_EMERG "%s: jbd2_start: " 2522 printk(KERN_CRIT "%s: jbd2_start: "
2436 "%ld pages, ino %lu; err %d\n", __func__, 2523 "%ld pages, ino %lu; err %d\n", __func__,
2437 wbc->nr_to_write, inode->i_ino, ret); 2524 wbc->nr_to_write, inode->i_ino, ret);
2438 dump_stack(); 2525 dump_stack();
@@ -2485,6 +2572,14 @@ out_writepages:
2485 if (!no_nrwrite_index_update) 2572 if (!no_nrwrite_index_update)
2486 wbc->no_nrwrite_index_update = 0; 2573 wbc->no_nrwrite_index_update = 0;
2487 wbc->nr_to_write -= nr_to_writebump; 2574 wbc->nr_to_write -= nr_to_writebump;
2575 trace_mark(ext4_da_writepage_result,
2576 "dev %s ino %lu ret %d pages_written %d "
2577 "pages_skipped %ld congestion %d "
2578 "more_io %d no_nrwrite_index_update %d",
2579 inode->i_sb->s_id, inode->i_ino, ret,
2580 pages_written, wbc->pages_skipped,
2581 wbc->encountered_congestion, wbc->more_io,
2582 wbc->no_nrwrite_index_update);
2488 return ret; 2583 return ret;
2489} 2584}
2490 2585
@@ -2497,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2497 /* 2592 /*
2498 * switch to non delalloc mode if we are running low 2593 * switch to non delalloc mode if we are running low
2499 * on free block. The free block accounting via percpu 2594 * on free block. The free block accounting via percpu
2500 * counters can get slightly wrong with FBC_BATCH getting 2595 * counters can get slightly wrong with percpu_counter_batch getting
2501 * accumulated on each CPU without updating global counters 2596 * accumulated on each CPU without updating global counters
2502 * Delalloc need an accurate free block accounting. So switch 2597 * Delalloc need an accurate free block accounting. So switch
2503 * to non delalloc when we are near to error range. 2598 * to non delalloc when we are near to error range.
@@ -2536,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2536 len, flags, pagep, fsdata); 2631 len, flags, pagep, fsdata);
2537 } 2632 }
2538 *fsdata = (void *)0; 2633 *fsdata = (void *)0;
2634
2635 trace_mark(ext4_da_write_begin,
2636 "dev %s ino %lu pos %llu len %u flags %u",
2637 inode->i_sb->s_id, inode->i_ino,
2638 (unsigned long long) pos, len, flags);
2539retry: 2639retry:
2540 /* 2640 /*
2541 * With delayed allocation, we don't log the i_disksize update 2641 * With delayed allocation, we don't log the i_disksize update
@@ -2549,7 +2649,7 @@ retry:
2549 goto out; 2649 goto out;
2550 } 2650 }
2551 2651
2552 page = __grab_cache_page(mapping, index); 2652 page = grab_cache_page_write_begin(mapping, index, flags);
2553 if (!page) { 2653 if (!page) {
2554 ext4_journal_stop(handle); 2654 ext4_journal_stop(handle);
2555 ret = -ENOMEM; 2655 ret = -ENOMEM;
@@ -2625,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
2625 } 2725 }
2626 } 2726 }
2627 2727
2728 trace_mark(ext4_da_write_end,
2729 "dev %s ino %lu pos %llu len %u copied %u",
2730 inode->i_sb->s_id, inode->i_ino,
2731 (unsigned long long) pos, len, copied);
2628 start = pos & (PAGE_CACHE_SIZE - 1); 2732 start = pos & (PAGE_CACHE_SIZE - 1);
2629 end = start + copied - 1; 2733 end = start + copied - 1;
2630 2734
@@ -2717,7 +2821,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2717 filemap_write_and_wait(mapping); 2821 filemap_write_and_wait(mapping);
2718 } 2822 }
2719 2823
2720 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2824 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
2721 /* 2825 /*
2722 * This is a REALLY heavyweight approach, but the use of 2826 * This is a REALLY heavyweight approach, but the use of
2723 * bmap on dirty files is expected to be extremely rare: 2827 * bmap on dirty files is expected to be extremely rare:
@@ -2835,6 +2939,9 @@ static int ext4_normal_writepage(struct page *page,
2835 loff_t size = i_size_read(inode); 2939 loff_t size = i_size_read(inode);
2836 loff_t len; 2940 loff_t len;
2837 2941
2942 trace_mark(ext4_normal_writepage,
2943 "dev %s ino %lu page_index %lu",
2944 inode->i_sb->s_id, inode->i_ino, page->index);
2838 J_ASSERT(PageLocked(page)); 2945 J_ASSERT(PageLocked(page));
2839 if (page->index == size >> PAGE_CACHE_SHIFT) 2946 if (page->index == size >> PAGE_CACHE_SHIFT)
2840 len = size & ~PAGE_CACHE_MASK; 2947 len = size & ~PAGE_CACHE_MASK;
@@ -2920,6 +3027,9 @@ static int ext4_journalled_writepage(struct page *page,
2920 loff_t size = i_size_read(inode); 3027 loff_t size = i_size_read(inode);
2921 loff_t len; 3028 loff_t len;
2922 3029
3030 trace_mark(ext4_journalled_writepage,
3031 "dev %s ino %lu page_index %lu",
3032 inode->i_sb->s_id, inode->i_ino, page->index);
2923 J_ASSERT(PageLocked(page)); 3033 J_ASSERT(PageLocked(page));
2924 if (page->index == size >> PAGE_CACHE_SHIFT) 3034 if (page->index == size >> PAGE_CACHE_SHIFT)
2925 len = size & ~PAGE_CACHE_MASK; 3035 len = size & ~PAGE_CACHE_MASK;
@@ -2988,7 +3098,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
2988 if (offset == 0) 3098 if (offset == 0)
2989 ClearPageChecked(page); 3099 ClearPageChecked(page);
2990 3100
2991 jbd2_journal_invalidatepage(journal, page, offset); 3101 if (journal)
3102 jbd2_journal_invalidatepage(journal, page, offset);
3103 else
3104 block_invalidatepage(page, offset);
2992} 3105}
2993 3106
2994static int ext4_releasepage(struct page *page, gfp_t wait) 3107static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2998,7 +3111,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
2998 WARN_ON(PageChecked(page)); 3111 WARN_ON(PageChecked(page));
2999 if (!page_has_buffers(page)) 3112 if (!page_has_buffers(page))
3000 return 0; 3113 return 0;
3001 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3114 if (journal)
3115 return jbd2_journal_try_to_free_buffers(journal, page, wait);
3116 else
3117 return try_to_free_buffers(page);
3002} 3118}
3003 3119
3004/* 3120/*
@@ -3270,7 +3386,7 @@ int ext4_block_truncate_page(handle_t *handle,
3270 3386
3271 err = 0; 3387 err = 0;
3272 if (ext4_should_journal_data(inode)) { 3388 if (ext4_should_journal_data(inode)) {
3273 err = ext4_journal_dirty_metadata(handle, bh); 3389 err = ext4_handle_dirty_metadata(handle, inode, bh);
3274 } else { 3390 } else {
3275 if (ext4_should_order_data(inode)) 3391 if (ext4_should_order_data(inode))
3276 err = ext4_jbd2_file_inode(handle, inode); 3392 err = ext4_jbd2_file_inode(handle, inode);
@@ -3394,8 +3510,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3394 __le32 *p; 3510 __le32 *p;
3395 if (try_to_extend_transaction(handle, inode)) { 3511 if (try_to_extend_transaction(handle, inode)) {
3396 if (bh) { 3512 if (bh) {
3397 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 3513 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
3398 ext4_journal_dirty_metadata(handle, bh); 3514 ext4_handle_dirty_metadata(handle, inode, bh);
3399 } 3515 }
3400 ext4_mark_inode_dirty(handle, inode); 3516 ext4_mark_inode_dirty(handle, inode);
3401 ext4_journal_test_restart(handle, inode); 3517 ext4_journal_test_restart(handle, inode);
@@ -3495,7 +3611,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3495 count, block_to_free_p, p); 3611 count, block_to_free_p, p);
3496 3612
3497 if (this_bh) { 3613 if (this_bh) {
3498 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3614 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
3499 3615
3500 /* 3616 /*
3501 * The buffer head should have an attached journal head at this 3617 * The buffer head should have an attached journal head at this
@@ -3503,8 +3619,8 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3503 * block pointed to itself, it would have been detached when 3619 * block pointed to itself, it would have been detached when
3504 * the block was cleared. Check for this instead of OOPSing. 3620 * the block was cleared. Check for this instead of OOPSing.
3505 */ 3621 */
3506 if (bh2jh(this_bh)) 3622 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
3507 ext4_journal_dirty_metadata(handle, this_bh); 3623 ext4_handle_dirty_metadata(handle, inode, this_bh);
3508 else 3624 else
3509 ext4_error(inode->i_sb, __func__, 3625 ext4_error(inode->i_sb, __func__,
3510 "circular indirect block detected, " 3626 "circular indirect block detected, "
@@ -3534,7 +3650,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3534 ext4_fsblk_t nr; 3650 ext4_fsblk_t nr;
3535 __le32 *p; 3651 __le32 *p;
3536 3652
3537 if (is_handle_aborted(handle)) 3653 if (ext4_handle_is_aborted(handle))
3538 return; 3654 return;
3539 3655
3540 if (depth--) { 3656 if (depth--) {
@@ -3604,7 +3720,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3604 * will merely complain about releasing a free block, 3720 * will merely complain about releasing a free block,
3605 * rather than leaking blocks. 3721 * rather than leaking blocks.
3606 */ 3722 */
3607 if (is_handle_aborted(handle)) 3723 if (ext4_handle_is_aborted(handle))
3608 return; 3724 return;
3609 if (try_to_extend_transaction(handle, inode)) { 3725 if (try_to_extend_transaction(handle, inode)) {
3610 ext4_mark_inode_dirty(handle, inode); 3726 ext4_mark_inode_dirty(handle, inode);
@@ -3623,9 +3739,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3623 parent_bh)){ 3739 parent_bh)){
3624 *p = 0; 3740 *p = 0;
3625 BUFFER_TRACE(parent_bh, 3741 BUFFER_TRACE(parent_bh,
3626 "call ext4_journal_dirty_metadata"); 3742 "call ext4_handle_dirty_metadata");
3627 ext4_journal_dirty_metadata(handle, 3743 ext4_handle_dirty_metadata(handle,
3628 parent_bh); 3744 inode,
3745 parent_bh);
3629 } 3746 }
3630 } 3747 }
3631 } 3748 }
@@ -3813,7 +3930,7 @@ do_indirects:
3813 * synchronous 3930 * synchronous
3814 */ 3931 */
3815 if (IS_SYNC(inode)) 3932 if (IS_SYNC(inode))
3816 handle->h_sync = 1; 3933 ext4_handle_sync(handle);
3817out_stop: 3934out_stop:
3818 /* 3935 /*
3819 * If this was a simple ftruncate(), and the file will remain alive 3936 * If this was a simple ftruncate(), and the file will remain alive
@@ -3843,7 +3960,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
3843 ext4_fsblk_t block; 3960 ext4_fsblk_t block;
3844 int inodes_per_block, inode_offset; 3961 int inodes_per_block, inode_offset;
3845 3962
3846 iloc->bh = 0; 3963 iloc->bh = NULL;
3847 if (!ext4_valid_inum(sb, inode->i_ino)) 3964 if (!ext4_valid_inum(sb, inode->i_ino))
3848 return -EIO; 3965 return -EIO;
3849 3966
@@ -3950,7 +4067,7 @@ make_io:
3950 num = EXT4_INODES_PER_GROUP(sb); 4067 num = EXT4_INODES_PER_GROUP(sb);
3951 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4068 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3952 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4069 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3953 num -= le16_to_cpu(gdp->bg_itable_unused); 4070 num -= ext4_itable_unused_count(sb, gdp);
3954 table += num / inodes_per_block; 4071 table += num / inodes_per_block;
3955 if (end > table) 4072 if (end > table)
3956 end = table; 4073 end = table;
@@ -4164,9 +4281,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4164 inode->i_op = &ext4_dir_inode_operations; 4281 inode->i_op = &ext4_dir_inode_operations;
4165 inode->i_fop = &ext4_dir_operations; 4282 inode->i_fop = &ext4_dir_operations;
4166 } else if (S_ISLNK(inode->i_mode)) { 4283 } else if (S_ISLNK(inode->i_mode)) {
4167 if (ext4_inode_is_fast_symlink(inode)) 4284 if (ext4_inode_is_fast_symlink(inode)) {
4168 inode->i_op = &ext4_fast_symlink_inode_operations; 4285 inode->i_op = &ext4_fast_symlink_inode_operations;
4169 else { 4286 nd_terminate_link(ei->i_data, inode->i_size,
4287 sizeof(ei->i_data) - 1);
4288 } else {
4170 inode->i_op = &ext4_symlink_inode_operations; 4289 inode->i_op = &ext4_symlink_inode_operations;
4171 ext4_set_aops(inode); 4290 ext4_set_aops(inode);
4172 } 4291 }
@@ -4310,8 +4429,8 @@ static int ext4_do_update_inode(handle_t *handle,
4310 EXT4_SET_RO_COMPAT_FEATURE(sb, 4429 EXT4_SET_RO_COMPAT_FEATURE(sb,
4311 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4430 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
4312 sb->s_dirt = 1; 4431 sb->s_dirt = 1;
4313 handle->h_sync = 1; 4432 ext4_handle_sync(handle);
4314 err = ext4_journal_dirty_metadata(handle, 4433 err = ext4_handle_dirty_metadata(handle, inode,
4315 EXT4_SB(sb)->s_sbh); 4434 EXT4_SB(sb)->s_sbh);
4316 } 4435 }
4317 } 4436 }
@@ -4338,9 +4457,8 @@ static int ext4_do_update_inode(handle_t *handle,
4338 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4457 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4339 } 4458 }
4340 4459
4341 4460 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4342 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 4461 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4343 rc = ext4_journal_dirty_metadata(handle, bh);
4344 if (!err) 4462 if (!err)
4345 err = rc; 4463 err = rc;
4346 ei->i_state &= ~EXT4_STATE_NEW; 4464 ei->i_state &= ~EXT4_STATE_NEW;
@@ -4403,6 +4521,25 @@ int ext4_write_inode(struct inode *inode, int wait)
4403 return ext4_force_commit(inode->i_sb); 4521 return ext4_force_commit(inode->i_sb);
4404} 4522}
4405 4523
4524int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
4525{
4526 int err = 0;
4527
4528 mark_buffer_dirty(bh);
4529 if (inode && inode_needs_sync(inode)) {
4530 sync_dirty_buffer(bh);
4531 if (buffer_req(bh) && !buffer_uptodate(bh)) {
4532 ext4_error(inode->i_sb, __func__,
4533 "IO error syncing inode, "
4534 "inode=%lu, block=%llu",
4535 inode->i_ino,
4536 (unsigned long long)bh->b_blocknr);
4537 err = -EIO;
4538 }
4539 }
4540 return err;
4541}
4542
4406/* 4543/*
4407 * ext4_setattr() 4544 * ext4_setattr()
4408 * 4545 *
@@ -4707,16 +4844,15 @@ int
4707ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 4844ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
4708 struct ext4_iloc *iloc) 4845 struct ext4_iloc *iloc)
4709{ 4846{
4710 int err = 0; 4847 int err;
4711 if (handle) { 4848
4712 err = ext4_get_inode_loc(inode, iloc); 4849 err = ext4_get_inode_loc(inode, iloc);
4713 if (!err) { 4850 if (!err) {
4714 BUFFER_TRACE(iloc->bh, "get_write_access"); 4851 BUFFER_TRACE(iloc->bh, "get_write_access");
4715 err = ext4_journal_get_write_access(handle, iloc->bh); 4852 err = ext4_journal_get_write_access(handle, iloc->bh);
4716 if (err) { 4853 if (err) {
4717 brelse(iloc->bh); 4854 brelse(iloc->bh);
4718 iloc->bh = NULL; 4855 iloc->bh = NULL;
4719 }
4720 } 4856 }
4721 } 4857 }
4722 ext4_std_error(inode->i_sb, err); 4858 ext4_std_error(inode->i_sb, err);
@@ -4788,7 +4924,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4788 4924
4789 might_sleep(); 4925 might_sleep();
4790 err = ext4_reserve_inode_write(handle, inode, &iloc); 4926 err = ext4_reserve_inode_write(handle, inode, &iloc);
4791 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 4927 if (ext4_handle_valid(handle) &&
4928 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4792 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 4929 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
4793 /* 4930 /*
4794 * We need extra buffer credits since we may write into EA block 4931 * We need extra buffer credits since we may write into EA block
@@ -4840,6 +4977,11 @@ void ext4_dirty_inode(struct inode *inode)
4840 handle_t *current_handle = ext4_journal_current_handle(); 4977 handle_t *current_handle = ext4_journal_current_handle();
4841 handle_t *handle; 4978 handle_t *handle;
4842 4979
4980 if (!ext4_handle_valid(current_handle)) {
4981 ext4_mark_inode_dirty(current_handle, inode);
4982 return;
4983 }
4984
4843 handle = ext4_journal_start(inode, 2); 4985 handle = ext4_journal_start(inode, 2);
4844 if (IS_ERR(handle)) 4986 if (IS_ERR(handle))
4845 goto out; 4987 goto out;
@@ -4877,8 +5019,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
4877 BUFFER_TRACE(iloc.bh, "get_write_access"); 5019 BUFFER_TRACE(iloc.bh, "get_write_access");
4878 err = jbd2_journal_get_write_access(handle, iloc.bh); 5020 err = jbd2_journal_get_write_access(handle, iloc.bh);
4879 if (!err) 5021 if (!err)
4880 err = ext4_journal_dirty_metadata(handle, 5022 err = ext4_handle_dirty_metadata(handle,
4881 iloc.bh); 5023 inode,
5024 iloc.bh);
4882 brelse(iloc.bh); 5025 brelse(iloc.bh);
4883 } 5026 }
4884 } 5027 }
@@ -4904,6 +5047,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4904 */ 5047 */
4905 5048
4906 journal = EXT4_JOURNAL(inode); 5049 journal = EXT4_JOURNAL(inode);
5050 if (!journal)
5051 return 0;
4907 if (is_journal_aborted(journal)) 5052 if (is_journal_aborted(journal))
4908 return -EROFS; 5053 return -EROFS;
4909 5054
@@ -4933,7 +5078,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4933 return PTR_ERR(handle); 5078 return PTR_ERR(handle);
4934 5079
4935 err = ext4_mark_inode_dirty(handle, inode); 5080 err = ext4_mark_inode_dirty(handle, inode);
4936 handle->h_sync = 1; 5081 ext4_handle_sync(handle);
4937 ext4_journal_stop(handle); 5082 ext4_journal_stop(handle);
4938 ext4_std_error(inode->i_sb, err); 5083 ext4_std_error(inode->i_sb, err);
4939 5084
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dc99b4776d58..42dc83fb247a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
99 goto flags_out; 99 goto flags_out;
100 } 100 }
101 if (IS_SYNC(inode)) 101 if (IS_SYNC(inode))
102 handle->h_sync = 1; 102 ext4_handle_sync(handle);
103 err = ext4_reserve_inode_write(handle, inode, &iloc); 103 err = ext4_reserve_inode_write(handle, inode, &iloc);
104 if (err) 104 if (err)
105 goto flags_err; 105 goto flags_err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad998f72e..deba54f6cbed 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
100 * inode as: 100 * inode as:
101 * 101 *
102 * { page } 102 * { page }
103 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 103 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
104 * 104 *
105 * 105 *
106 * one block each for bitmap and buddy information. So for each group we 106 * one block each for bitmap and buddy information. So for each group we
@@ -330,6 +330,18 @@
330 * object 330 * object
331 * 331 *
332 */ 332 */
333static struct kmem_cache *ext4_pspace_cachep;
334static struct kmem_cache *ext4_ac_cachep;
335static struct kmem_cache *ext4_free_ext_cachep;
336static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343
344
333 345
334static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 346static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
335{ 347{
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
445 blocknr += first + i; 457 blocknr += first + i;
446 blocknr += 458 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 459 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 460 ext4_grp_locked_error(sb, e4b->bd_group,
449 ext4_error(sb, __func__, "double-free of inode" 461 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %lu)\n", 462 " %lu's block %llu(bit %u in group %u)",
451 inode ? inode->i_ino : 0, blocknr, 463 inode ? inode->i_ino : 0, blocknr,
452 first + i, e4b->bd_group); 464 first + i, e4b->bd_group);
453 } 465 }
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 489 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 490 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 491 if (b1[i] != b2[i]) {
480 printk(KERN_ERR "corruption in group %lu " 492 printk(KERN_ERR "corruption in group %u "
481 "at byte %u(%u): %x in copy != %x " 493 "at byte %u(%u): %x in copy != %x "
482 "on disk/prealloc\n", 494 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]); 495 e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
690 grp->bb_fragments = fragments; 702 grp->bb_fragments = fragments;
691 703
692 if (free != grp->bb_free) { 704 if (free != grp->bb_free) {
693 ext4_error(sb, __func__, 705 ext4_grp_locked_error(sb, group, __func__,
694 "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", 706 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
695 group, free, grp->bb_free); 707 group, free, grp->bb_free);
696 /* 708 /*
697 * If we intent to continue, we consider group descritor 709 * If we intent to continue, we consider group descritor
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
716 * stored in the inode as 728 * stored in the inode as
717 * 729 *
718 * { page } 730 * { page }
719 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 731 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
720 * 732 *
721 * 733 *
722 * one block each for bitmap and buddy information. 734 * one block each for bitmap and buddy information.
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
782 if (bh[i] == NULL) 794 if (bh[i] == NULL)
783 goto out; 795 goto out;
784 796
785 if (buffer_uptodate(bh[i]) && 797 if (bitmap_uptodate(bh[i]))
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
787 continue; 798 continue;
788 799
789 lock_buffer(bh[i]); 800 lock_buffer(bh[i]);
801 if (bitmap_uptodate(bh[i])) {
802 unlock_buffer(bh[i]);
803 continue;
804 }
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 805 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 806 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 807 ext4_init_block_bitmap(sb, bh[i],
793 first_group + i, desc); 808 first_group + i, desc);
809 set_bitmap_uptodate(bh[i]);
794 set_buffer_uptodate(bh[i]); 810 set_buffer_uptodate(bh[i]);
795 unlock_buffer(bh[i]);
796 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 811 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
812 unlock_buffer(bh[i]);
797 continue; 813 continue;
798 } 814 }
799 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 815 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
816 if (buffer_uptodate(bh[i])) {
817 /*
818 * if not uninit if bh is uptodate,
819 * bitmap is also uptodate
820 */
821 set_bitmap_uptodate(bh[i]);
822 unlock_buffer(bh[i]);
823 continue;
824 }
800 get_bh(bh[i]); 825 get_bh(bh[i]);
826 /*
827 * submit the buffer_head for read. We can
828 * safely mark the bitmap as uptodate now.
829 * We do it here so the bitmap uptodate bit
830 * get set with buffer lock held.
831 */
832 set_bitmap_uptodate(bh[i]);
801 bh[i]->b_end_io = end_buffer_read_sync; 833 bh[i]->b_end_io = end_buffer_read_sync;
802 submit_bh(READ, bh[i]); 834 submit_bh(READ, bh[i]);
803 mb_debug("read bitmap for group %lu\n", first_group + i); 835 mb_debug("read bitmap for group %u\n", first_group + i);
804 } 836 }
805 837
806 /* wait for I/O completion */ 838 /* wait for I/O completion */
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
814 846
815 err = 0; 847 err = 0;
816 first_block = page->index * blocks_per_page; 848 first_block = page->index * blocks_per_page;
849 /* init the page */
850 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
817 for (i = 0; i < blocks_per_page; i++) { 851 for (i = 0; i < blocks_per_page; i++) {
818 int group; 852 int group;
819 struct ext4_group_info *grinfo; 853 struct ext4_group_info *grinfo;
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
840 BUG_ON(incore == NULL); 874 BUG_ON(incore == NULL);
841 mb_debug("put buddy for group %u in page %lu/%x\n", 875 mb_debug("put buddy for group %u in page %lu/%x\n",
842 group, page->index, i * blocksize); 876 group, page->index, i * blocksize);
843 memset(data, 0xff, blocksize);
844 grinfo = ext4_get_group_info(sb, group); 877 grinfo = ext4_get_group_info(sb, group);
845 grinfo->bb_fragments = 0; 878 grinfo->bb_fragments = 0;
846 memset(grinfo->bb_counters, 0, 879 memset(grinfo->bb_counters, 0,
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
848 /* 881 /*
849 * incore got set to the group block bitmap below 882 * incore got set to the group block bitmap below
850 */ 883 */
884 ext4_lock_group(sb, group);
851 ext4_mb_generate_buddy(sb, data, incore, group); 885 ext4_mb_generate_buddy(sb, data, incore, group);
886 ext4_unlock_group(sb, group);
852 incore = NULL; 887 incore = NULL;
853 } else { 888 } else {
854 /* this is block of bitmap */ 889 /* this is block of bitmap */
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 897
863 /* mark all preallocated blks used in in-core bitmap */ 898 /* mark all preallocated blks used in in-core bitmap */
864 ext4_mb_generate_from_pa(sb, data, group); 899 ext4_mb_generate_from_pa(sb, data, group);
900 ext4_mb_generate_from_freelist(sb, data, group);
865 ext4_unlock_group(sb, group); 901 ext4_unlock_group(sb, group);
866 902
867 /* set incore so that the buddy information can be 903 /* set incore so that the buddy information can be
@@ -886,18 +922,20 @@ static noinline_for_stack int
886ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 922ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
887 struct ext4_buddy *e4b) 923 struct ext4_buddy *e4b)
888{ 924{
889 struct ext4_sb_info *sbi = EXT4_SB(sb);
890 struct inode *inode = sbi->s_buddy_cache;
891 int blocks_per_page; 925 int blocks_per_page;
892 int block; 926 int block;
893 int pnum; 927 int pnum;
894 int poff; 928 int poff;
895 struct page *page; 929 struct page *page;
896 int ret; 930 int ret;
931 struct ext4_group_info *grp;
932 struct ext4_sb_info *sbi = EXT4_SB(sb);
933 struct inode *inode = sbi->s_buddy_cache;
897 934
898 mb_debug("load group %lu\n", group); 935 mb_debug("load group %u\n", group);
899 936
900 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 937 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
938 grp = ext4_get_group_info(sb, group);
901 939
902 e4b->bd_blkbits = sb->s_blocksize_bits; 940 e4b->bd_blkbits = sb->s_blocksize_bits;
903 e4b->bd_info = ext4_get_group_info(sb, group); 941 e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
905 e4b->bd_group = group; 943 e4b->bd_group = group;
906 e4b->bd_buddy_page = NULL; 944 e4b->bd_buddy_page = NULL;
907 e4b->bd_bitmap_page = NULL; 945 e4b->bd_bitmap_page = NULL;
946 e4b->alloc_semp = &grp->alloc_sem;
947
948 /* Take the read lock on the group alloc
949 * sem. This would make sure a parallel
950 * ext4_mb_init_group happening on other
951 * groups mapped by the page is blocked
952 * till we are done with allocation
953 */
954 down_read(e4b->alloc_semp);
908 955
909 /* 956 /*
910 * the buddy cache inode stores the block bitmap 957 * the buddy cache inode stores the block bitmap
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
920 page = find_get_page(inode->i_mapping, pnum); 967 page = find_get_page(inode->i_mapping, pnum);
921 if (page == NULL || !PageUptodate(page)) { 968 if (page == NULL || !PageUptodate(page)) {
922 if (page) 969 if (page)
970 /*
971 * drop the page reference and try
972 * to get the page with lock. If we
973 * are not uptodate that implies
974 * somebody just created the page but
975 * is yet to initialize the same. So
976 * wait for it to initialize.
977 */
923 page_cache_release(page); 978 page_cache_release(page);
924 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 979 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
925 if (page) { 980 if (page) {
@@ -985,6 +1040,9 @@ err:
985 page_cache_release(e4b->bd_buddy_page); 1040 page_cache_release(e4b->bd_buddy_page);
986 e4b->bd_buddy = NULL; 1041 e4b->bd_buddy = NULL;
987 e4b->bd_bitmap = NULL; 1042 e4b->bd_bitmap = NULL;
1043
1044 /* Done with the buddy cache */
1045 up_read(e4b->alloc_semp);
988 return ret; 1046 return ret;
989} 1047}
990 1048
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
994 page_cache_release(e4b->bd_bitmap_page); 1052 page_cache_release(e4b->bd_bitmap_page);
995 if (e4b->bd_buddy_page) 1053 if (e4b->bd_buddy_page)
996 page_cache_release(e4b->bd_buddy_page); 1054 page_cache_release(e4b->bd_buddy_page);
1055 /* Done with the buddy cache */
1056 if (e4b->alloc_semp)
1057 up_read(e4b->alloc_semp);
997} 1058}
998 1059
999 1060
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1031 cur += 32; 1092 cur += 32;
1032 continue; 1093 continue;
1033 } 1094 }
1034 mb_clear_bit_atomic(lock, cur, bm); 1095 if (lock)
1096 mb_clear_bit_atomic(lock, cur, bm);
1097 else
1098 mb_clear_bit(cur, bm);
1035 cur++; 1099 cur++;
1036 } 1100 }
1037} 1101}
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1049 cur += 32; 1113 cur += 32;
1050 continue; 1114 continue;
1051 } 1115 }
1052 mb_set_bit_atomic(lock, cur, bm); 1116 if (lock)
1117 mb_set_bit_atomic(lock, cur, bm);
1118 else
1119 mb_set_bit(cur, bm);
1053 cur++; 1120 cur++;
1054 } 1121 }
1055} 1122}
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1094 blocknr += block; 1161 blocknr += block;
1095 blocknr += 1162 blocknr +=
1096 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1163 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1097 ext4_unlock_group(sb, e4b->bd_group); 1164 ext4_grp_locked_error(sb, e4b->bd_group,
1098 ext4_error(sb, __func__, "double-free of inode" 1165 __func__, "double-free of inode"
1099 " %lu's block %llu(bit %u in group %lu)\n", 1166 " %lu's block %llu(bit %u in group %u)",
1100 inode ? inode->i_ino : 0, blocknr, block, 1167 inode ? inode->i_ino : 0, blocknr, block,
1101 e4b->bd_group); 1168 e4b->bd_group);
1102 ext4_lock_group(sb, e4b->bd_group);
1103 } 1169 }
1104 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1170 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1105 e4b->bd_info->bb_counters[order]++; 1171 e4b->bd_info->bb_counters[order]++;
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1296 ac->ac_tail = ret & 0xffff; 1362 ac->ac_tail = ret & 0xffff;
1297 ac->ac_buddy = ret >> 16; 1363 ac->ac_buddy = ret >> 16;
1298 1364
1299 /* XXXXXXX: SUCH A HORRIBLE **CK */ 1365 /*
1300 /*FIXME!! Why ? */ 1366 * take the page reference. We want the page to be pinned
1367 * so that we don't get a ext4_mb_init_cache_call for this
1368 * group until we update the bitmap. That would mean we
1369 * double allocate blocks. The reference is dropped
1370 * in ext4_mb_release_context
1371 */
1301 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1372 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1302 get_page(ac->ac_bitmap_page); 1373 get_page(ac->ac_bitmap_page);
1303 ac->ac_buddy_page = e4b->bd_buddy_page; 1374 ac->ac_buddy_page = e4b->bd_buddy_page;
1304 get_page(ac->ac_buddy_page); 1375 get_page(ac->ac_buddy_page);
1305 1376 /* on allocation we use ac to track the held semaphore */
1377 ac->alloc_semp = e4b->alloc_semp;
1378 e4b->alloc_semp = NULL;
1306 /* store last allocated for subsequent stream allocation */ 1379 /* store last allocated for subsequent stream allocation */
1307 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1380 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1308 spin_lock(&sbi->s_md_lock); 1381 spin_lock(&sbi->s_md_lock);
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1326 struct ext4_free_extent ex; 1399 struct ext4_free_extent ex;
1327 int max; 1400 int max;
1328 1401
1402 if (ac->ac_status == AC_STATUS_FOUND)
1403 return;
1329 /* 1404 /*
1330 * We don't want to scan for a whole year 1405 * We don't want to scan for a whole year
1331 */ 1406 */
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1575 * free blocks even though group info says we 1650 * free blocks even though group info says we
1576 * we have free blocks 1651 * we have free blocks
1577 */ 1652 */
1578 ext4_error(sb, __func__, "%d free blocks as per " 1653 ext4_grp_locked_error(sb, e4b->bd_group,
1579 "group info. But bitmap says 0\n", 1654 __func__, "%d free blocks as per "
1655 "group info. But bitmap says 0",
1580 free); 1656 free);
1581 break; 1657 break;
1582 } 1658 }
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1584 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1660 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1585 BUG_ON(ex.fe_len <= 0); 1661 BUG_ON(ex.fe_len <= 0);
1586 if (free < ex.fe_len) { 1662 if (free < ex.fe_len) {
1587 ext4_error(sb, __func__, "%d free blocks as per " 1663 ext4_grp_locked_error(sb, e4b->bd_group,
1588 "group info. But got %d blocks\n", 1664 __func__, "%d free blocks as per "
1665 "group info. But got %d blocks",
1589 free, ex.fe_len); 1666 free, ex.fe_len);
1590 /* 1667 /*
1591 * The number of free blocks differs. This mostly 1668 * The number of free blocks differs. This mostly
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1692 return 0; 1769 return 0;
1693} 1770}
1694 1771
1772/*
1773 * lock the group_info alloc_sem of all the groups
1774 * belonging to the same buddy cache page. This
1775 * make sure other parallel operation on the buddy
1776 * cache doesn't happen whild holding the buddy cache
1777 * lock
1778 */
1779int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1780{
1781 int i;
1782 int block, pnum;
1783 int blocks_per_page;
1784 int groups_per_page;
1785 ext4_group_t first_group;
1786 struct ext4_group_info *grp;
1787
1788 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1789 /*
1790 * the buddy cache inode stores the block bitmap
1791 * and buddy information in consecutive blocks.
1792 * So for each group we need two blocks.
1793 */
1794 block = group * 2;
1795 pnum = block / blocks_per_page;
1796 first_group = pnum * blocks_per_page / 2;
1797
1798 groups_per_page = blocks_per_page >> 1;
1799 if (groups_per_page == 0)
1800 groups_per_page = 1;
1801 /* read all groups the page covers into the cache */
1802 for (i = 0; i < groups_per_page; i++) {
1803
1804 if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1805 break;
1806 grp = ext4_get_group_info(sb, first_group + i);
1807 /* take all groups write allocation
1808 * semaphore. This make sure there is
1809 * no block allocation going on in any
1810 * of that groups
1811 */
1812 down_write_nested(&grp->alloc_sem, i);
1813 }
1814 return i;
1815}
1816
1817void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1818 ext4_group_t group, int locked_group)
1819{
1820 int i;
1821 int block, pnum;
1822 int blocks_per_page;
1823 ext4_group_t first_group;
1824 struct ext4_group_info *grp;
1825
1826 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1827 /*
1828 * the buddy cache inode stores the block bitmap
1829 * and buddy information in consecutive blocks.
1830 * So for each group we need two blocks.
1831 */
1832 block = group * 2;
1833 pnum = block / blocks_per_page;
1834 first_group = pnum * blocks_per_page / 2;
1835 /* release locks on all the groups */
1836 for (i = 0; i < locked_group; i++) {
1837
1838 grp = ext4_get_group_info(sb, first_group + i);
1839 /* take all groups write allocation
1840 * semaphore. This make sure there is
1841 * no block allocation going on in any
1842 * of that groups
1843 */
1844 up_write(&grp->alloc_sem);
1845 }
1846
1847}
1848
1849static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1850{
1851
1852 int ret;
1853 void *bitmap;
1854 int blocks_per_page;
1855 int block, pnum, poff;
1856 int num_grp_locked = 0;
1857 struct ext4_group_info *this_grp;
1858 struct ext4_sb_info *sbi = EXT4_SB(sb);
1859 struct inode *inode = sbi->s_buddy_cache;
1860 struct page *page = NULL, *bitmap_page = NULL;
1861
1862 mb_debug("init group %lu\n", group);
1863 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1864 this_grp = ext4_get_group_info(sb, group);
1865 /*
1866 * This ensures we don't add group
1867 * to this buddy cache via resize
1868 */
1869 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1870 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1871 /*
1872 * somebody initialized the group
1873 * return without doing anything
1874 */
1875 ret = 0;
1876 goto err;
1877 }
1878 /*
1879 * the buddy cache inode stores the block bitmap
1880 * and buddy information in consecutive blocks.
1881 * So for each group we need two blocks.
1882 */
1883 block = group * 2;
1884 pnum = block / blocks_per_page;
1885 poff = block % blocks_per_page;
1886 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1887 if (page) {
1888 BUG_ON(page->mapping != inode->i_mapping);
1889 ret = ext4_mb_init_cache(page, NULL);
1890 if (ret) {
1891 unlock_page(page);
1892 goto err;
1893 }
1894 unlock_page(page);
1895 }
1896 if (page == NULL || !PageUptodate(page)) {
1897 ret = -EIO;
1898 goto err;
1899 }
1900 mark_page_accessed(page);
1901 bitmap_page = page;
1902 bitmap = page_address(page) + (poff * sb->s_blocksize);
1903
1904 /* init buddy cache */
1905 block++;
1906 pnum = block / blocks_per_page;
1907 poff = block % blocks_per_page;
1908 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1909 if (page == bitmap_page) {
1910 /*
1911 * If both the bitmap and buddy are in
1912 * the same page we don't need to force
1913 * init the buddy
1914 */
1915 unlock_page(page);
1916 } else if (page) {
1917 BUG_ON(page->mapping != inode->i_mapping);
1918 ret = ext4_mb_init_cache(page, bitmap);
1919 if (ret) {
1920 unlock_page(page);
1921 goto err;
1922 }
1923 unlock_page(page);
1924 }
1925 if (page == NULL || !PageUptodate(page)) {
1926 ret = -EIO;
1927 goto err;
1928 }
1929 mark_page_accessed(page);
1930err:
1931 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1932 if (bitmap_page)
1933 page_cache_release(bitmap_page);
1934 if (page)
1935 page_cache_release(page);
1936 return ret;
1937}
1938
1695static noinline_for_stack int 1939static noinline_for_stack int
1696ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1940ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1697{ 1941{
@@ -1775,7 +2019,7 @@ repeat:
1775 group = 0; 2019 group = 0;
1776 2020
1777 /* quick check to skip empty groups */ 2021 /* quick check to skip empty groups */
1778 grp = ext4_get_group_info(ac->ac_sb, group); 2022 grp = ext4_get_group_info(sb, group);
1779 if (grp->bb_free == 0) 2023 if (grp->bb_free == 0)
1780 continue; 2024 continue;
1781 2025
@@ -1788,10 +2032,9 @@ repeat:
1788 * we need full data about the group 2032 * we need full data about the group
1789 * to make a good selection 2033 * to make a good selection
1790 */ 2034 */
1791 err = ext4_mb_load_buddy(sb, group, &e4b); 2035 err = ext4_mb_init_group(sb, group);
1792 if (err) 2036 if (err)
1793 goto out; 2037 goto out;
1794 ext4_mb_release_desc(&e4b);
1795 } 2038 }
1796 2039
1797 /* 2040 /*
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1932 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2175 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
1933 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2176 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
1934 "%-5u %-5s %-5u %-6u\n"; 2177 "%-5u %-5s %-5u %-6u\n";
1935 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2178 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1936 hs->result.fe_start, hs->result.fe_len, 2179 hs->result.fe_start, hs->result.fe_len,
1937 hs->result.fe_logical); 2180 hs->result.fe_logical);
1938 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2181 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1939 hs->orig.fe_start, hs->orig.fe_len, 2182 hs->orig.fe_start, hs->orig.fe_len,
1940 hs->orig.fe_logical); 2183 hs->orig.fe_logical);
1941 sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, 2184 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
1942 hs->goal.fe_start, hs->goal.fe_len, 2185 hs->goal.fe_start, hs->goal.fe_len,
1943 hs->goal.fe_logical); 2186 hs->goal.fe_logical);
1944 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, 2187 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1947 hs->buddy ? 1 << hs->buddy : 0); 2190 hs->buddy ? 1 << hs->buddy : 0);
1948 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { 2191 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
1949 fmt = "%-5u %-8u %-23s %-23s %-23s\n"; 2192 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
1950 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2193 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1951 hs->result.fe_start, hs->result.fe_len, 2194 hs->result.fe_start, hs->result.fe_len,
1952 hs->result.fe_logical); 2195 hs->result.fe_logical);
1953 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2196 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1954 hs->orig.fe_start, hs->orig.fe_len, 2197 hs->orig.fe_start, hs->orig.fe_len,
1955 hs->orig.fe_logical); 2198 hs->orig.fe_logical);
1956 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); 2199 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
1957 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { 2200 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
1958 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2201 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1959 hs->result.fe_start, hs->result.fe_len); 2202 hs->result.fe_start, hs->result.fe_len);
1960 seq_printf(seq, "%-5u %-8u %-23s discard\n", 2203 seq_printf(seq, "%-5u %-8u %-23s discard\n",
1961 hs->pid, hs->ino, buf2); 2204 hs->pid, hs->ino, buf2);
1962 } else if (hs->op == EXT4_MB_HISTORY_FREE) { 2205 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
1963 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2206 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1964 hs->result.fe_start, hs->result.fe_len); 2207 hs->result.fe_start, hs->result.fe_len);
1965 seq_printf(seq, "%-5u %-8u %-23s free\n", 2208 seq_printf(seq, "%-5u %-8u %-23s free\n",
1966 hs->pid, hs->ino, buf2); 2209 hs->pid, hs->ino, buf2);
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2073 return NULL; 2316 return NULL;
2074 2317
2075 group = *pos + 1; 2318 group = *pos + 1;
2076 return (void *) group; 2319 return (void *) ((unsigned long) group);
2077} 2320}
2078 2321
2079static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2322static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2086 if (*pos < 0 || *pos >= sbi->s_groups_count) 2329 if (*pos < 0 || *pos >= sbi->s_groups_count)
2087 return NULL; 2330 return NULL;
2088 group = *pos + 1; 2331 group = *pos + 1;
2089 return (void *) group;; 2332 return (void *) ((unsigned long) group);
2090} 2333}
2091 2334
2092static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2335static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2093{ 2336{
2094 struct super_block *sb = seq->private; 2337 struct super_block *sb = seq->private;
2095 long group = (long) v; 2338 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2096 int i; 2339 int i;
2097 int err; 2340 int err;
2098 struct ext4_buddy e4b; 2341 struct ext4_buddy e4b;
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2114 sizeof(struct ext4_group_info); 2357 sizeof(struct ext4_group_info);
2115 err = ext4_mb_load_buddy(sb, group, &e4b); 2358 err = ext4_mb_load_buddy(sb, group, &e4b);
2116 if (err) { 2359 if (err) {
2117 seq_printf(seq, "#%-5lu: I/O error\n", group); 2360 seq_printf(seq, "#%-5u: I/O error\n", group);
2118 return 0; 2361 return 0;
2119 } 2362 }
2120 ext4_lock_group(sb, group); 2363 ext4_lock_group(sb, group);
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2122 ext4_unlock_group(sb, group); 2365 ext4_unlock_group(sb, group);
2123 ext4_mb_release_desc(&e4b); 2366 ext4_mb_release_desc(&e4b);
2124 2367
2125 seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, 2368 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2126 sg.info.bb_fragments, sg.info.bb_first_free); 2369 sg.info.bb_fragments, sg.info.bb_first_free);
2127 for (i = 0; i <= 13; i++) 2370 for (i = 0; i <= 13; i++)
2128 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 2371 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2296 ext4_free_blocks_after_init(sb, group, desc); 2539 ext4_free_blocks_after_init(sb, group, desc);
2297 } else { 2540 } else {
2298 meta_group_info[i]->bb_free = 2541 meta_group_info[i]->bb_free =
2299 le16_to_cpu(desc->bg_free_blocks_count); 2542 ext4_free_blks_count(sb, desc);
2300 } 2543 }
2301 2544
2302 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2545 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2546 init_rwsem(&meta_group_info[i]->alloc_sem);
2303 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2547 meta_group_info[i]->bb_free_root.rb_node = NULL;;
2304 2548
2305#ifdef DOUBLE_CHECK 2549#ifdef DOUBLE_CHECK
@@ -2327,54 +2571,6 @@ exit_meta_group_info:
2327} /* ext4_mb_add_groupinfo */ 2571} /* ext4_mb_add_groupinfo */
2328 2572
2329/* 2573/*
2330 * Add a group to the existing groups.
2331 * This function is used for online resize
2332 */
2333int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
2334 struct ext4_group_desc *desc)
2335{
2336 struct ext4_sb_info *sbi = EXT4_SB(sb);
2337 struct inode *inode = sbi->s_buddy_cache;
2338 int blocks_per_page;
2339 int block;
2340 int pnum;
2341 struct page *page;
2342 int err;
2343
2344 /* Add group based on group descriptor*/
2345 err = ext4_mb_add_groupinfo(sb, group, desc);
2346 if (err)
2347 return err;
2348
2349 /*
2350 * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
2351 * datas) are set not up to date so that they will be re-initilaized
2352 * during the next call to ext4_mb_load_buddy
2353 */
2354
2355 /* Set buddy page as not up to date */
2356 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2357 block = group * 2;
2358 pnum = block / blocks_per_page;
2359 page = find_get_page(inode->i_mapping, pnum);
2360 if (page != NULL) {
2361 ClearPageUptodate(page);
2362 page_cache_release(page);
2363 }
2364
2365 /* Set bitmap page as not up to date */
2366 block++;
2367 pnum = block / blocks_per_page;
2368 page = find_get_page(inode->i_mapping, pnum);
2369 if (page != NULL) {
2370 ClearPageUptodate(page);
2371 page_cache_release(page);
2372 }
2373
2374 return 0;
2375}
2376
2377/*
2378 * Update an existing group. 2574 * Update an existing group.
2379 * This function is used for online resize 2575 * This function is used for online resize
2380 */ 2576 */
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2457 desc = ext4_get_group_desc(sb, i, NULL); 2653 desc = ext4_get_group_desc(sb, i, NULL);
2458 if (desc == NULL) { 2654 if (desc == NULL) {
2459 printk(KERN_ERR 2655 printk(KERN_ERR
2460 "EXT4-fs: can't read descriptor %lu\n", i); 2656 "EXT4-fs: can't read descriptor %u\n", i);
2461 goto err_freebuddy; 2657 goto err_freebuddy;
2462 } 2658 }
2463 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2659 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2493 if (sbi->s_mb_offsets == NULL) { 2689 if (sbi->s_mb_offsets == NULL) {
2494 return -ENOMEM; 2690 return -ENOMEM;
2495 } 2691 }
2692
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2496 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2497 if (sbi->s_mb_maxs == NULL) { 2695 if (sbi->s_mb_maxs == NULL) {
2498 kfree(sbi->s_mb_maxs); 2696 kfree(sbi->s_mb_maxs);
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2551 ext4_mb_init_per_dev_proc(sb); 2749 ext4_mb_init_per_dev_proc(sb);
2552 ext4_mb_history_init(sb); 2750 ext4_mb_history_init(sb);
2553 2751
2554 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2752 if (sbi->s_journal)
2753 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2555 2754
2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); 2755 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2557 return 0; 2756 return 0;
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2652 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2851 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2653 entry = list_entry(l, struct ext4_free_data, list); 2852 entry = list_entry(l, struct ext4_free_data, list);
2654 2853
2655 mb_debug("gonna free %u blocks in group %lu (0x%p):", 2854 mb_debug("gonna free %u blocks in group %u (0x%p):",
2656 entry->count, entry->group, entry); 2855 entry->count, entry->group, entry);
2657 2856
2658 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2857 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2679 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2878 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2680 + entry->start_blk 2879 + entry->start_blk
2681 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2880 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2682 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, 2881 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
2683 (unsigned long long) discard_block, entry->count); 2882 sb->s_id, (unsigned long long) discard_block,
2883 entry->count);
2684 sb_issue_discard(sb, discard_block, entry->count); 2884 sb_issue_discard(sb, discard_block, entry->count);
2685 2885
2686 kmem_cache_free(ext4_free_ext_cachep, entry); 2886 kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void)
2791 */ 2991 */
2792static noinline_for_stack int 2992static noinline_for_stack int
2793ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2993ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2794 handle_t *handle, unsigned long reserv_blks) 2994 handle_t *handle, unsigned int reserv_blks)
2795{ 2995{
2796 struct buffer_head *bitmap_bh = NULL; 2996 struct buffer_head *bitmap_bh = NULL;
2797 struct ext4_super_block *es; 2997 struct ext4_super_block *es;
@@ -2824,8 +3024,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2824 if (!gdp) 3024 if (!gdp)
2825 goto out_err; 3025 goto out_err;
2826 3026
2827 ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, 3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2828 gdp->bg_free_blocks_count); 3028 ext4_free_blks_count(sb, gdp));
2829 3029
2830 err = ext4_journal_get_write_access(handle, gdp_bh); 3030 err = ext4_journal_get_write_access(handle, gdp_bh);
2831 if (err) 3031 if (err)
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2843 in_range(block + len - 1, ext4_inode_table(sb, gdp), 3043 in_range(block + len - 1, ext4_inode_table(sb, gdp),
2844 EXT4_SB(sb)->s_itb_per_group)) { 3044 EXT4_SB(sb)->s_itb_per_group)) {
2845 ext4_error(sb, __func__, 3045 ext4_error(sb, __func__,
2846 "Allocating block in system zone - block = %llu", 3046 "Allocating block %llu in system zone of %d group\n",
2847 block); 3047 block, ac->ac_b_ex.fe_group);
2848 /* File system mounted not to panic on error 3048 /* File system mounted not to panic on error
2849 * Fix the bitmap and repeat the block allocation 3049 * Fix the bitmap and repeat the block allocation
2850 * We leak some of the blocks here. 3050 * We leak some of the blocks here.
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2852 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), 3052 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
2853 bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3053 bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2854 ac->ac_b_ex.fe_len); 3054 ac->ac_b_ex.fe_len);
2855 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3055 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2856 if (!err) 3056 if (!err)
2857 err = -EAGAIN; 3057 err = -EAGAIN;
2858 goto out_err; 3058 goto out_err;
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2866 } 3066 }
2867 } 3067 }
2868#endif 3068#endif
2869 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
2870 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2871
2872 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3069 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3070 mb_set_bits(NULL, bitmap_bh->b_data,
3071 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2873 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 3072 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2874 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3073 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2875 gdp->bg_free_blocks_count = 3074 ext4_free_blks_set(sb, gdp,
2876 cpu_to_le16(ext4_free_blocks_after_init(sb, 3075 ext4_free_blocks_after_init(sb,
2877 ac->ac_b_ex.fe_group, 3076 ac->ac_b_ex.fe_group, gdp));
2878 gdp));
2879 } 3077 }
2880 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 3078 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
3079 ext4_free_blks_set(sb, gdp, len);
2881 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 3080 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2882 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3081 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2883 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 3082 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2899 spin_unlock(sb_bgl_lock(sbi, flex_group)); 3098 spin_unlock(sb_bgl_lock(sbi, flex_group));
2900 } 3099 }
2901 3100
2902 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3101 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2903 if (err) 3102 if (err)
2904 goto out_err; 3103 goto out_err;
2905 err = ext4_journal_dirty_metadata(handle, gdp_bh); 3104 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2906 3105
2907out_err: 3106out_err:
2908 sb->s_dirt = 1; 3107 sb->s_dirt = 1;
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3031 /* check we don't cross already preallocated blocks */ 3230 /* check we don't cross already preallocated blocks */
3032 rcu_read_lock(); 3231 rcu_read_lock();
3033 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3232 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3034 unsigned long pa_end; 3233 ext4_lblk_t pa_end;
3035 3234
3036 if (pa->pa_deleted) 3235 if (pa->pa_deleted)
3037 continue; 3236 continue;
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3075 /* XXX: extra loop to check we really don't overlap preallocations */ 3274 /* XXX: extra loop to check we really don't overlap preallocations */
3076 rcu_read_lock(); 3275 rcu_read_lock();
3077 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3276 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3078 unsigned long pa_end; 3277 ext4_lblk_t pa_end;
3079 spin_lock(&pa->pa_lock); 3278 spin_lock(&pa->pa_lock);
3080 if (pa->pa_deleted == 0) { 3279 if (pa->pa_deleted == 0) {
3081 pa_end = pa->pa_lstart + pa->pa_len; 3280 pa_end = pa->pa_lstart + pa->pa_len;
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3307} 3506}
3308 3507
3309/* 3508/*
3509 * the function goes through all block freed in the group
3510 * but not yet committed and marks them used in in-core bitmap.
3511 * buddy must be generated from this bitmap
3512 * Need to be called with ext4 group lock (ext4_lock_group)
3513 */
3514static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3515 ext4_group_t group)
3516{
3517 struct rb_node *n;
3518 struct ext4_group_info *grp;
3519 struct ext4_free_data *entry;
3520
3521 grp = ext4_get_group_info(sb, group);
3522 n = rb_first(&(grp->bb_free_root));
3523
3524 while (n) {
3525 entry = rb_entry(n, struct ext4_free_data, node);
3526 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
3527 bitmap, entry->start_blk,
3528 entry->count);
3529 n = rb_next(n);
3530 }
3531 return;
3532}
3533
3534/*
3310 * the function goes through all preallocation in this group and marks them 3535 * the function goes through all preallocation in this group and marks them
3311 * used in in-core bitmap. buddy must be generated from this bitmap 3536 * used in in-core bitmap. buddy must be generated from this bitmap
3312 * Need to be called with ext4 group lock (ext4_lock_group) 3537 * Need to be called with ext4 group lock (ext4_lock_group)
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3346 preallocated += len; 3571 preallocated += len;
3347 count++; 3572 count++;
3348 } 3573 }
3349 mb_debug("prellocated %u for group %lu\n", preallocated, group); 3574 mb_debug("prellocated %u for group %u\n", preallocated, group);
3350} 3575}
3351 3576
3352static void ext4_mb_pa_callback(struct rcu_head *head) 3577static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
3363static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3588static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3364 struct super_block *sb, struct ext4_prealloc_space *pa) 3589 struct super_block *sb, struct ext4_prealloc_space *pa)
3365{ 3590{
3366 unsigned long grp; 3591 ext4_group_t grp;
3367 3592
3368 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3593 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3369 return; 3594 return;
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3473 3698
3474 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3699 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3475 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3700 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3701 trace_mark(ext4_mb_new_inode_pa,
3702 "dev %s ino %lu pstart %llu len %u lstart %u",
3703 sb->s_id, ac->ac_inode->i_ino,
3704 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3476 3705
3477 ext4_mb_use_inode_pa(ac, pa); 3706 ext4_mb_use_inode_pa(ac, pa);
3478 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3707 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3530 pa->pa_linear = 1; 3759 pa->pa_linear = 1;
3531 3760
3532 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3761 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3533 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3762 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3763 trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
3764 sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3534 3765
3535 ext4_mb_use_group_pa(ac, pa); 3766 ext4_mb_use_group_pa(ac, pa);
3536 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3767 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3579{ 3810{
3580 struct super_block *sb = e4b->bd_sb; 3811 struct super_block *sb = e4b->bd_sb;
3581 struct ext4_sb_info *sbi = EXT4_SB(sb); 3812 struct ext4_sb_info *sbi = EXT4_SB(sb);
3582 unsigned long end; 3813 unsigned int end;
3583 unsigned long next; 3814 unsigned int next;
3584 ext4_group_t group; 3815 ext4_group_t group;
3585 ext4_grpblk_t bit; 3816 ext4_grpblk_t bit;
3817 unsigned long long grp_blk_start;
3586 sector_t start; 3818 sector_t start;
3587 int err = 0; 3819 int err = 0;
3588 int free = 0; 3820 int free = 0;
3589 3821
3590 BUG_ON(pa->pa_deleted == 0); 3822 BUG_ON(pa->pa_deleted == 0);
3591 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3823 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3824 grp_blk_start = pa->pa_pstart - bit;
3592 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3825 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3593 end = bit + pa->pa_len; 3826 end = bit + pa->pa_len;
3594 3827
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3618 ext4_mb_store_history(ac); 3851 ext4_mb_store_history(ac);
3619 } 3852 }
3620 3853
3854 trace_mark(ext4_mb_release_inode_pa,
3855 "dev %s ino %lu block %llu count %u",
3856 sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
3857 next - bit);
3621 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3858 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3622 bit = next + 1; 3859 bit = next + 1;
3623 } 3860 }
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3626 pa, (unsigned long) pa->pa_lstart, 3863 pa, (unsigned long) pa->pa_lstart,
3627 (unsigned long) pa->pa_pstart, 3864 (unsigned long) pa->pa_pstart,
3628 (unsigned long) pa->pa_len); 3865 (unsigned long) pa->pa_len);
3629 ext4_error(sb, __func__, "free %u, pa_free %u\n", 3866 ext4_grp_locked_error(sb, group,
3630 free, pa->pa_free); 3867 __func__, "free %u, pa_free %u",
3868 free, pa->pa_free);
3631 /* 3869 /*
3632 * pa is already deleted so we use the value obtained 3870 * pa is already deleted so we use the value obtained
3633 * from the bitmap and continue. 3871 * from the bitmap and continue.
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3650 if (ac) 3888 if (ac)
3651 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3889 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3652 3890
3891 trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
3892 sb->s_id, pa->pa_pstart, pa->pa_len);
3653 BUG_ON(pa->pa_deleted == 0); 3893 BUG_ON(pa->pa_deleted == 0);
3654 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3894 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3655 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3895 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3692 int busy = 0; 3932 int busy = 0;
3693 int free = 0; 3933 int free = 0;
3694 3934
3695 mb_debug("discard preallocation for group %lu\n", group); 3935 mb_debug("discard preallocation for group %u\n", group);
3696 3936
3697 if (list_empty(&grp->bb_prealloc_list)) 3937 if (list_empty(&grp->bb_prealloc_list))
3698 return 0; 3938 return 0;
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3700 bitmap_bh = ext4_read_block_bitmap(sb, group); 3940 bitmap_bh = ext4_read_block_bitmap(sb, group);
3701 if (bitmap_bh == NULL) { 3941 if (bitmap_bh == NULL) {
3702 ext4_error(sb, __func__, "Error in reading block " 3942 ext4_error(sb, __func__, "Error in reading block "
3703 "bitmap for %lu\n", group); 3943 "bitmap for %u", group);
3704 return 0; 3944 return 0;
3705 } 3945 }
3706 3946
3707 err = ext4_mb_load_buddy(sb, group, &e4b); 3947 err = ext4_mb_load_buddy(sb, group, &e4b);
3708 if (err) { 3948 if (err) {
3709 ext4_error(sb, __func__, "Error in loading buddy " 3949 ext4_error(sb, __func__, "Error in loading buddy "
3710 "information for %lu\n", group); 3950 "information for %u", group);
3711 put_bh(bitmap_bh); 3951 put_bh(bitmap_bh);
3712 return 0; 3952 return 0;
3713 } 3953 }
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
3815 } 4055 }
3816 4056
3817 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4057 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
4058 trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
4059 inode->i_ino);
3818 4060
3819 INIT_LIST_HEAD(&list); 4061 INIT_LIST_HEAD(&list);
3820 4062
@@ -3874,14 +4116,14 @@ repeat:
3874 err = ext4_mb_load_buddy(sb, group, &e4b); 4116 err = ext4_mb_load_buddy(sb, group, &e4b);
3875 if (err) { 4117 if (err) {
3876 ext4_error(sb, __func__, "Error in loading buddy " 4118 ext4_error(sb, __func__, "Error in loading buddy "
3877 "information for %lu\n", group); 4119 "information for %u", group);
3878 continue; 4120 continue;
3879 } 4121 }
3880 4122
3881 bitmap_bh = ext4_read_block_bitmap(sb, group); 4123 bitmap_bh = ext4_read_block_bitmap(sb, group);
3882 if (bitmap_bh == NULL) { 4124 if (bitmap_bh == NULL) {
3883 ext4_error(sb, __func__, "Error in reading block " 4125 ext4_error(sb, __func__, "Error in reading block "
3884 "bitmap for %lu\n", group); 4126 "bitmap for %u", group);
3885 ext4_mb_release_desc(&e4b); 4127 ext4_mb_release_desc(&e4b);
3886 continue; 4128 continue;
3887 } 4129 }
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4024 struct ext4_sb_info *sbi = EXT4_SB(sb); 4266 struct ext4_sb_info *sbi = EXT4_SB(sb);
4025 struct ext4_super_block *es = sbi->s_es; 4267 struct ext4_super_block *es = sbi->s_es;
4026 ext4_group_t group; 4268 ext4_group_t group;
4027 unsigned long len; 4269 unsigned int len;
4028 unsigned long goal; 4270 ext4_fsblk_t goal;
4029 ext4_grpblk_t block; 4271 ext4_grpblk_t block;
4030 4272
4031 /* we can't allocate > group size */ 4273 /* we can't allocate > group size */
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4068 ac->ac_pa = NULL; 4310 ac->ac_pa = NULL;
4069 ac->ac_bitmap_page = NULL; 4311 ac->ac_bitmap_page = NULL;
4070 ac->ac_buddy_page = NULL; 4312 ac->ac_buddy_page = NULL;
4313 ac->alloc_semp = NULL;
4071 ac->ac_lg = NULL; 4314 ac->ac_lg = NULL;
4072 4315
4073 /* we have to define context: we'll we work with a file or 4316 /* we have to define context: we'll we work with a file or
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4146 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4389 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4147 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4390 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4148 ext4_error(sb, __func__, "Error in loading buddy " 4391 ext4_error(sb, __func__, "Error in loading buddy "
4149 "information for %lu\n", group); 4392 "information for %u", group);
4150 continue; 4393 continue;
4151 } 4394 }
4152 ext4_lock_group(sb, group); 4395 ext4_lock_group(sb, group);
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4248 } 4491 }
4249 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4492 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4250 } 4493 }
4494 if (ac->alloc_semp)
4495 up_read(ac->alloc_semp);
4251 if (ac->ac_bitmap_page) 4496 if (ac->ac_bitmap_page)
4252 page_cache_release(ac->ac_bitmap_page); 4497 page_cache_release(ac->ac_bitmap_page);
4253 if (ac->ac_buddy_page) 4498 if (ac->ac_buddy_page)
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4264 int ret; 4509 int ret;
4265 int freed = 0; 4510 int freed = 0;
4266 4511
4512 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
4513 sb->s_id, needed);
4267 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { 4514 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4268 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4515 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4269 freed += ret; 4516 freed += ret;
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4286 struct ext4_sb_info *sbi; 4533 struct ext4_sb_info *sbi;
4287 struct super_block *sb; 4534 struct super_block *sb;
4288 ext4_fsblk_t block = 0; 4535 ext4_fsblk_t block = 0;
4289 unsigned long inquota; 4536 unsigned int inquota;
4290 unsigned long reserv_blks = 0; 4537 unsigned int reserv_blks = 0;
4291 4538
4292 sb = ar->inode->i_sb; 4539 sb = ar->inode->i_sb;
4293 sbi = EXT4_SB(sb); 4540 sbi = EXT4_SB(sb);
4294 4541
4542 trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
4543 "lblk %llu goal %llu lleft %llu lright %llu "
4544 "pleft %llu pright %llu ",
4545 sb->s_id, ar->flags, ar->len,
4546 ar->inode ? ar->inode->i_ino : 0,
4547 (unsigned long long) ar->logical,
4548 (unsigned long long) ar->goal,
4549 (unsigned long long) ar->lleft,
4550 (unsigned long long) ar->lright,
4551 (unsigned long long) ar->pleft,
4552 (unsigned long long) ar->pright);
4553
4295 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4554 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4296 /* 4555 /*
4297 * With delalloc we already reserved the blocks 4556 * With delalloc we already reserved the blocks
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 } 4572 }
4314 if (ar->len == 0) { 4573 if (ar->len == 0) {
4315 *errp = -EDQUOT; 4574 *errp = -EDQUOT;
4316 return 0; 4575 goto out3;
4317 } 4576 }
4318 inquota = ar->len; 4577 inquota = ar->len;
4319 4578
@@ -4348,10 +4607,14 @@ repeat:
4348 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4607 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4349 ext4_mb_new_preallocation(ac); 4608 ext4_mb_new_preallocation(ac);
4350 } 4609 }
4351
4352 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4610 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4353 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4611 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4354 if (*errp == -EAGAIN) { 4612 if (*errp == -EAGAIN) {
4613 /*
4614 * drop the reference that we took
4615 * in ext4_mb_use_best_found
4616 */
4617 ext4_mb_release_context(ac);
4355 ac->ac_b_ex.fe_group = 0; 4618 ac->ac_b_ex.fe_group = 0;
4356 ac->ac_b_ex.fe_start = 0; 4619 ac->ac_b_ex.fe_start = 0;
4357 ac->ac_b_ex.fe_len = 0; 4620 ac->ac_b_ex.fe_len = 0;
@@ -4382,6 +4645,26 @@ out2:
4382out1: 4645out1:
4383 if (ar->len < inquota) 4646 if (ar->len < inquota)
4384 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4647 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4648out3:
4649 if (!ar->len) {
4650 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4651 /* release all the reserved blocks if non delalloc */
4652 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4653 reserv_blks);
4654 }
4655
4656 trace_mark(ext4_allocate_blocks,
4657 "dev %s block %llu flags %u len %u ino %lu "
4658 "logical %llu goal %llu lleft %llu lright %llu "
4659 "pleft %llu pright %llu ",
4660 sb->s_id, (unsigned long long) block,
4661 ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
4662 (unsigned long long) ar->logical,
4663 (unsigned long long) ar->goal,
4664 (unsigned long long) ar->lleft,
4665 (unsigned long long) ar->lright,
4666 (unsigned long long) ar->pleft,
4667 (unsigned long long) ar->pright);
4385 4668
4386 return block; 4669 return block;
4387} 4670}
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1,
4403 4686
4404static noinline_for_stack int 4687static noinline_for_stack int
4405ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4688ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4406 ext4_group_t group, ext4_grpblk_t block, int count) 4689 struct ext4_free_data *new_entry)
4407{ 4690{
4691 ext4_grpblk_t block;
4692 struct ext4_free_data *entry;
4408 struct ext4_group_info *db = e4b->bd_info; 4693 struct ext4_group_info *db = e4b->bd_info;
4409 struct super_block *sb = e4b->bd_sb; 4694 struct super_block *sb = e4b->bd_sb;
4410 struct ext4_sb_info *sbi = EXT4_SB(sb); 4695 struct ext4_sb_info *sbi = EXT4_SB(sb);
4411 struct ext4_free_data *entry, *new_entry;
4412 struct rb_node **n = &db->bb_free_root.rb_node, *node; 4696 struct rb_node **n = &db->bb_free_root.rb_node, *node;
4413 struct rb_node *parent = NULL, *new_node; 4697 struct rb_node *parent = NULL, *new_node;
4414 4698
4415 4699 BUG_ON(!ext4_handle_valid(handle));
4416 BUG_ON(e4b->bd_bitmap_page == NULL); 4700 BUG_ON(e4b->bd_bitmap_page == NULL);
4417 BUG_ON(e4b->bd_buddy_page == NULL); 4701 BUG_ON(e4b->bd_buddy_page == NULL);
4418 4702
4419 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4420 new_entry->start_blk = block;
4421 new_entry->group = group;
4422 new_entry->count = count;
4423 new_entry->t_tid = handle->h_transaction->t_tid;
4424 new_node = &new_entry->node; 4703 new_node = &new_entry->node;
4704 block = new_entry->start_blk;
4425 4705
4426 ext4_lock_group(sb, group);
4427 if (!*n) { 4706 if (!*n) {
4428 /* first free block exent. We need to 4707 /* first free block exent. We need to
4429 protect buddy cache from being freed, 4708 protect buddy cache from being freed,
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4441 else if (block >= (entry->start_blk + entry->count)) 4720 else if (block >= (entry->start_blk + entry->count))
4442 n = &(*n)->rb_right; 4721 n = &(*n)->rb_right;
4443 else { 4722 else {
4444 ext4_unlock_group(sb, group); 4723 ext4_grp_locked_error(sb, e4b->bd_group, __func__,
4445 ext4_error(sb, __func__, 4724 "Double free of blocks %d (%d %d)",
4446 "Double free of blocks %d (%d %d)\n", 4725 block, entry->start_blk, entry->count);
4447 block, entry->start_blk, entry->count);
4448 return 0; 4726 return 0;
4449 } 4727 }
4450 } 4728 }
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4483 spin_lock(&sbi->s_md_lock); 4761 spin_lock(&sbi->s_md_lock);
4484 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4762 list_add(&new_entry->list, &handle->h_transaction->t_private_list);
4485 spin_unlock(&sbi->s_md_lock); 4763 spin_unlock(&sbi->s_md_lock);
4486 ext4_unlock_group(sb, group);
4487 return 0; 4764 return 0;
4488} 4765}
4489 4766
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4499 struct ext4_allocation_context *ac = NULL; 4776 struct ext4_allocation_context *ac = NULL;
4500 struct ext4_group_desc *gdp; 4777 struct ext4_group_desc *gdp;
4501 struct ext4_super_block *es; 4778 struct ext4_super_block *es;
4502 unsigned long overflow; 4779 unsigned int overflow;
4503 ext4_grpblk_t bit; 4780 ext4_grpblk_t bit;
4504 struct buffer_head *gd_bh; 4781 struct buffer_head *gd_bh;
4505 ext4_group_t block_group; 4782 ext4_group_t block_group;
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4522 } 4799 }
4523 4800
4524 ext4_debug("freeing block %lu\n", block); 4801 ext4_debug("freeing block %lu\n", block);
4802 trace_mark(ext4_free_blocks,
4803 "dev %s block %llu count %lu metadata %d ino %lu",
4804 sb->s_id, (unsigned long long) block, count, metadata,
4805 inode ? inode->i_ino : 0);
4525 4806
4526 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4807 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4527 if (ac) { 4808 if (ac) {
@@ -4581,11 +4862,6 @@ do_more:
4581 err = ext4_journal_get_write_access(handle, gd_bh); 4862 err = ext4_journal_get_write_access(handle, gd_bh);
4582 if (err) 4863 if (err)
4583 goto error_return; 4864 goto error_return;
4584
4585 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4586 if (err)
4587 goto error_return;
4588
4589#ifdef AGGRESSIVE_CHECK 4865#ifdef AGGRESSIVE_CHECK
4590 { 4866 {
4591 int i; 4867 int i;
@@ -4593,13 +4869,6 @@ do_more:
4593 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4869 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4594 } 4870 }
4595#endif 4871#endif
4596 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4597 bit, count);
4598
4599 /* We dirtied the bitmap block */
4600 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4601 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
4602
4603 if (ac) { 4872 if (ac) {
4604 ac->ac_b_ex.fe_group = block_group; 4873 ac->ac_b_ex.fe_group = block_group;
4605 ac->ac_b_ex.fe_start = bit; 4874 ac->ac_b_ex.fe_start = bit;
@@ -4607,19 +4876,41 @@ do_more:
4607 ext4_mb_store_history(ac); 4876 ext4_mb_store_history(ac);
4608 } 4877 }
4609 4878
4610 if (metadata) { 4879 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4611 /* blocks being freed are metadata. these blocks shouldn't 4880 if (err)
4612 * be used until this transaction is committed */ 4881 goto error_return;
4613 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); 4882 if (metadata && ext4_handle_valid(handle)) {
4883 struct ext4_free_data *new_entry;
4884 /*
4885 * blocks being freed are metadata. these blocks shouldn't
4886 * be used until this transaction is committed
4887 */
4888 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4889 new_entry->start_blk = bit;
4890 new_entry->group = block_group;
4891 new_entry->count = count;
4892 new_entry->t_tid = handle->h_transaction->t_tid;
4893 ext4_lock_group(sb, block_group);
4894 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4895 bit, count);
4896 ext4_mb_free_metadata(handle, &e4b, new_entry);
4897 ext4_unlock_group(sb, block_group);
4614 } else { 4898 } else {
4615 ext4_lock_group(sb, block_group); 4899 ext4_lock_group(sb, block_group);
4900 /* need to update group_info->bb_free and bitmap
4901 * with group lock held. generate_buddy look at
4902 * them with group lock_held
4903 */
4904 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4905 bit, count);
4616 mb_free_blocks(inode, &e4b, bit, count); 4906 mb_free_blocks(inode, &e4b, bit, count);
4617 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4907 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4618 ext4_unlock_group(sb, block_group); 4908 ext4_unlock_group(sb, block_group);
4619 } 4909 }
4620 4910
4621 spin_lock(sb_bgl_lock(sbi, block_group)); 4911 spin_lock(sb_bgl_lock(sbi, block_group));
4622 le16_add_cpu(&gdp->bg_free_blocks_count, count); 4912 ret = ext4_free_blks_count(sb, gdp) + count;
4913 ext4_free_blks_set(sb, gdp, ret);
4623 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4914 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4624 spin_unlock(sb_bgl_lock(sbi, block_group)); 4915 spin_unlock(sb_bgl_lock(sbi, block_group));
4625 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4916 percpu_counter_add(&sbi->s_freeblocks_counter, count);
@@ -4635,9 +4926,13 @@ do_more:
4635 4926
4636 *freed += count; 4927 *freed += count;
4637 4928
4929 /* We dirtied the bitmap block */
4930 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4931 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4932
4638 /* And the group descriptor block */ 4933 /* And the group descriptor block */
4639 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4934 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4640 ret = ext4_journal_dirty_metadata(handle, gd_bh); 4935 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4641 if (!err) 4936 if (!err)
4642 err = ret; 4937 err = ret;
4643 4938
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1fff1e5..10a2921baf14 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
20#include <linux/version.h> 20#include <linux/version.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/marker.h> 22#include <linux/marker.h>
23#include <linux/mutex.h>
23#include "ext4_jbd2.h" 24#include "ext4_jbd2.h"
24#include "ext4.h" 25#include "ext4.h"
25#include "group.h" 26#include "group.h"
@@ -98,9 +99,6 @@
98 */ 99 */
99#define MB_DEFAULT_GROUP_PREALLOC 512 100#define MB_DEFAULT_GROUP_PREALLOC 512
100 101
101static struct kmem_cache *ext4_pspace_cachep;
102static struct kmem_cache *ext4_ac_cachep;
103static struct kmem_cache *ext4_free_ext_cachep;
104 102
105struct ext4_free_data { 103struct ext4_free_data {
106 /* this links the free block information from group_info */ 104 /* this links the free block information from group_info */
@@ -120,26 +118,6 @@ struct ext4_free_data {
120 tid_t t_tid; 118 tid_t t_tid;
121}; 119};
122 120
123struct ext4_group_info {
124 unsigned long bb_state;
125 struct rb_root bb_free_root;
126 unsigned short bb_first_free;
127 unsigned short bb_free;
128 unsigned short bb_fragments;
129 struct list_head bb_prealloc_list;
130#ifdef DOUBLE_CHECK
131 void *bb_bitmap;
132#endif
133 unsigned short bb_counters[];
134};
135
136#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
137#define EXT4_GROUP_INFO_LOCKED_BIT 1
138
139#define EXT4_MB_GRP_NEED_INIT(grp) \
140 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
141
142
143struct ext4_prealloc_space { 121struct ext4_prealloc_space {
144 struct list_head pa_inode_list; 122 struct list_head pa_inode_list;
145 struct list_head pa_group_list; 123 struct list_head pa_group_list;
@@ -217,6 +195,11 @@ struct ext4_allocation_context {
217 __u8 ac_op; /* operation, for history only */ 195 __u8 ac_op; /* operation, for history only */
218 struct page *ac_bitmap_page; 196 struct page *ac_bitmap_page;
219 struct page *ac_buddy_page; 197 struct page *ac_buddy_page;
198 /*
199 * pointer to the held semaphore upon successful
200 * block allocation
201 */
202 struct rw_semaphore *alloc_semp;
220 struct ext4_prealloc_space *ac_pa; 203 struct ext4_prealloc_space *ac_pa;
221 struct ext4_locality_group *ac_lg; 204 struct ext4_locality_group *ac_lg;
222}; 205};
@@ -250,6 +233,7 @@ struct ext4_buddy {
250 struct super_block *bd_sb; 233 struct super_block *bd_sb;
251 __u16 bd_blkbits; 234 __u16 bd_blkbits;
252 ext4_group_t bd_group; 235 ext4_group_t bd_group;
236 struct rw_semaphore *alloc_semp;
253}; 237};
254#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 238#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
255#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 239#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
259{ 243{
260 return; 244 return;
261} 245}
262#else
263static void ext4_mb_store_history(struct ext4_allocation_context *ac);
264#endif 246#endif
265 247
266#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
267 249
268struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
269 251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
270static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
271 ext4_group_t group);
272static void ext4_mb_return_to_preallocation(struct inode *inode,
273 struct ext4_buddy *e4b, sector_t block,
274 int count);
275static void ext4_mb_put_pa(struct ext4_allocation_context *,
276 struct super_block *, struct ext4_prealloc_space *pa);
277static int ext4_mb_init_per_dev_proc(struct super_block *sb);
278static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
279static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
280
281
282static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
283{
284 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
285
286 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
287}
288
289static inline void ext4_unlock_group(struct super_block *sb,
290 ext4_group_t group)
291{
292 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
293
294 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
295}
296
297static inline int ext4_is_group_locked(struct super_block *sb,
298 ext4_group_t group)
299{
300 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
301
302 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
303 &(grinfo->bb_state));
304}
305
306static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
307 struct ext4_free_extent *fex) 252 struct ext4_free_extent *fex)
308{ 253{
309 ext4_fsblk_t block; 254 ext4_fsblk_t block;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f2a9cf498ecd..734abca25e35 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
59 /* 59 /*
60 * Make sure the credit we accumalated is not really high 60 * Make sure the credit we accumalated is not really high
61 */ 61 */
62 if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { 62 if (needed && ext4_handle_has_enough_credits(handle,
63 EXT4_RESERVE_TRANS_BLOCKS)) {
63 retval = ext4_journal_restart(handle, needed); 64 retval = ext4_journal_restart(handle, needed);
64 if (retval) 65 if (retval)
65 goto err_out; 66 goto err_out;
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
229{ 230{
230 int retval = 0, needed; 231 int retval = 0, needed;
231 232
232 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 233 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
233 return 0; 234 return 0;
234 /* 235 /*
235 * We are freeing a blocks. During this we touch 236 * We are freeing a blocks. During this we touch
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
458 struct list_blocks_struct lb; 459 struct list_blocks_struct lb;
459 unsigned long max_entries; 460 unsigned long max_entries;
460 461
461 if (!test_opt(inode->i_sb, EXTENTS)) 462 /*
462 /* 463 * If the filesystem does not support extents, or the inode
463 * if mounted with noextents we don't allow the migrate 464 * already is extent-based, error out.
464 */ 465 */
465 return -EINVAL; 466 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
466 467 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
467 if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 468 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
468 return -EINVAL; 469 return -EINVAL;
469 470
470 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 471 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 63adcb792988..ba702bd7910d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
@@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
372 goto fail; 368 goto fail;
373 } 369 }
374 hinfo->hash_version = root->info.hash_version; 370 hinfo->hash_version = root->info.hash_version;
371 if (hinfo->hash_version <= DX_HASH_TEA)
372 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
375 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 373 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
376 if (d_name) 374 if (d_name)
377 ext4fs_dirhash(d_name->name, d_name->len, hinfo); 375 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
@@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
641 dir = dir_file->f_path.dentry->d_inode; 639 dir = dir_file->f_path.dentry->d_inode;
642 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 640 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
643 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 641 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
642 if (hinfo.hash_version <= DX_HASH_TEA)
643 hinfo.hash_version +=
644 EXT4_SB(dir->i_sb)->s_hash_unsigned;
644 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 645 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
645 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 646 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
646 start_hash, start_minor_hash); 647 start_hash, start_minor_hash);
@@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name,
806static inline int search_dirblock(struct buffer_head *bh, 807static inline int search_dirblock(struct buffer_head *bh,
807 struct inode *dir, 808 struct inode *dir,
808 const struct qstr *d_name, 809 const struct qstr *d_name,
809 unsigned long offset, 810 unsigned int offset,
810 struct ext4_dir_entry_2 ** res_dir) 811 struct ext4_dir_entry_2 ** res_dir)
811{ 812{
812 struct ext4_dir_entry_2 * de; 813 struct ext4_dir_entry_2 * de;
@@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1043 bh = ext4_find_entry(dir, &dentry->d_name, &de); 1044 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1044 inode = NULL; 1045 inode = NULL;
1045 if (bh) { 1046 if (bh) {
1046 unsigned long ino = le32_to_cpu(de->inode); 1047 __u32 ino = le32_to_cpu(de->inode);
1047 brelse(bh); 1048 brelse(bh);
1048 if (!ext4_valid_inum(dir->i_sb, ino)) { 1049 if (!ext4_valid_inum(dir->i_sb, ino)) {
1049 ext4_error(dir->i_sb, "ext4_lookup", 1050 ext4_error(dir->i_sb, "ext4_lookup",
1050 "bad inode number: %lu", ino); 1051 "bad inode number: %u", ino);
1051 return ERR_PTR(-EIO); 1052 return ERR_PTR(-EIO);
1052 } 1053 }
1053 inode = ext4_iget(dir->i_sb, ino); 1054 inode = ext4_iget(dir->i_sb, ino);
@@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1060 1061
1061struct dentry *ext4_get_parent(struct dentry *child) 1062struct dentry *ext4_get_parent(struct dentry *child)
1062{ 1063{
1063 unsigned long ino; 1064 __u32 ino;
1064 struct inode *inode; 1065 struct inode *inode;
1065 static const struct qstr dotdot = { 1066 static const struct qstr dotdot = {
1066 .name = "..", 1067 .name = "..",
@@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1078 1079
1079 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1080 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1080 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1081 ext4_error(child->d_inode->i_sb, "ext4_get_parent",
1081 "bad inode number: %lu", ino); 1082 "bad inode number: %u", ino);
1082 return ERR_PTR(-EIO); 1083 return ERR_PTR(-EIO);
1083 } 1084 }
1084 1085
@@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1166 u32 hash2; 1167 u32 hash2;
1167 struct dx_map_entry *map; 1168 struct dx_map_entry *map;
1168 char *data1 = (*bh)->b_data, *data2; 1169 char *data1 = (*bh)->b_data, *data2;
1169 unsigned split, move, size, i; 1170 unsigned split, move, size;
1170 struct ext4_dir_entry_2 *de = NULL, *de2; 1171 struct ext4_dir_entry_2 *de = NULL, *de2;
1171 int err = 0; 1172 int err = 0, i;
1172 1173
1173 bh2 = ext4_append (handle, dir, &newblock, &err); 1174 bh2 = ext4_append (handle, dir, &newblock, &err);
1174 if (!(bh2)) { 1175 if (!(bh2)) {
@@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1228 de = de2; 1229 de = de2;
1229 } 1230 }
1230 dx_insert_block(frame, hash2 + continued, newblock); 1231 dx_insert_block(frame, hash2 + continued, newblock);
1231 err = ext4_journal_dirty_metadata(handle, bh2); 1232 err = ext4_handle_dirty_metadata(handle, dir, bh2);
1232 if (err) 1233 if (err)
1233 goto journal_error; 1234 goto journal_error;
1234 err = ext4_journal_dirty_metadata(handle, frame->bh); 1235 err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
1235 if (err) 1236 if (err)
1236 goto journal_error; 1237 goto journal_error;
1237 brelse(bh2); 1238 brelse(bh2);
@@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1266 struct inode *dir = dentry->d_parent->d_inode; 1267 struct inode *dir = dentry->d_parent->d_inode;
1267 const char *name = dentry->d_name.name; 1268 const char *name = dentry->d_name.name;
1268 int namelen = dentry->d_name.len; 1269 int namelen = dentry->d_name.len;
1269 unsigned long offset = 0; 1270 unsigned int offset = 0;
1270 unsigned short reclen; 1271 unsigned short reclen;
1271 int nlen, rlen, err; 1272 int nlen, rlen, err;
1272 char *top; 1273 char *top;
@@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1335 ext4_update_dx_flag(dir); 1336 ext4_update_dx_flag(dir);
1336 dir->i_version++; 1337 dir->i_version++;
1337 ext4_mark_inode_dirty(handle, dir); 1338 ext4_mark_inode_dirty(handle, dir);
1338 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1339 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1339 err = ext4_journal_dirty_metadata(handle, bh); 1340 err = ext4_handle_dirty_metadata(handle, dir, bh);
1340 if (err) 1341 if (err)
1341 ext4_std_error(dir->i_sb, err); 1342 ext4_std_error(dir->i_sb, err);
1342 brelse(bh); 1343 brelse(bh);
@@ -1367,7 +1368,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1367 struct fake_dirent *fde; 1368 struct fake_dirent *fde;
1368 1369
1369 blocksize = dir->i_sb->s_blocksize; 1370 blocksize = dir->i_sb->s_blocksize;
1370 dxtrace(printk(KERN_DEBUG "Creating index\n")); 1371 dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1371 retval = ext4_journal_get_write_access(handle, bh); 1372 retval = ext4_journal_get_write_access(handle, bh);
1372 if (retval) { 1373 if (retval) {
1373 ext4_std_error(dir->i_sb, retval); 1374 ext4_std_error(dir->i_sb, retval);
@@ -1376,6 +1377,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1376 } 1377 }
1377 root = (struct dx_root *) bh->b_data; 1378 root = (struct dx_root *) bh->b_data;
1378 1379
1380 /* The 0th block becomes the root, move the dirents out */
1381 fde = &root->dotdot;
1382 de = (struct ext4_dir_entry_2 *)((char *)fde +
1383 ext4_rec_len_from_disk(fde->rec_len));
1384 if ((char *) de >= (((char *) root) + blocksize)) {
1385 ext4_error(dir->i_sb, __func__,
1386 "invalid rec_len for '..' in inode %lu",
1387 dir->i_ino);
1388 brelse(bh);
1389 return -EIO;
1390 }
1391 len = ((char *) root) + blocksize - (char *) de;
1392
1393 /* Allocate new block for the 0th block's dirents */
1379 bh2 = ext4_append(handle, dir, &block, &retval); 1394 bh2 = ext4_append(handle, dir, &block, &retval);
1380 if (!(bh2)) { 1395 if (!(bh2)) {
1381 brelse(bh); 1396 brelse(bh);
@@ -1384,11 +1399,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1384 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1399 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
1385 data1 = bh2->b_data; 1400 data1 = bh2->b_data;
1386 1401
1387 /* The 0th block becomes the root, move the dirents out */
1388 fde = &root->dotdot;
1389 de = (struct ext4_dir_entry_2 *)((char *)fde +
1390 ext4_rec_len_from_disk(fde->rec_len));
1391 len = ((char *) root) + blocksize - (char *) de;
1392 memcpy (data1, de, len); 1402 memcpy (data1, de, len);
1393 de = (struct ext4_dir_entry_2 *) data1; 1403 de = (struct ext4_dir_entry_2 *) data1;
1394 top = data1 + len; 1404 top = data1 + len;
@@ -1408,6 +1418,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1408 1418
1409 /* Initialize as for dx_probe */ 1419 /* Initialize as for dx_probe */
1410 hinfo.hash_version = root->info.hash_version; 1420 hinfo.hash_version = root->info.hash_version;
1421 if (hinfo.hash_version <= DX_HASH_TEA)
1422 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
1411 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 1423 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1412 ext4fs_dirhash(name, namelen, &hinfo); 1424 ext4fs_dirhash(name, namelen, &hinfo);
1413 frame = frames; 1425 frame = frames;
@@ -1437,7 +1449,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1437 struct inode *inode) 1449 struct inode *inode)
1438{ 1450{
1439 struct inode *dir = dentry->d_parent->d_inode; 1451 struct inode *dir = dentry->d_parent->d_inode;
1440 unsigned long offset;
1441 struct buffer_head *bh; 1452 struct buffer_head *bh;
1442 struct ext4_dir_entry_2 *de; 1453 struct ext4_dir_entry_2 *de;
1443 struct super_block *sb; 1454 struct super_block *sb;
@@ -1459,7 +1470,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1459 ext4_mark_inode_dirty(handle, dir); 1470 ext4_mark_inode_dirty(handle, dir);
1460 } 1471 }
1461 blocks = dir->i_size >> sb->s_blocksize_bits; 1472 blocks = dir->i_size >> sb->s_blocksize_bits;
1462 for (block = 0, offset = 0; block < blocks; block++) { 1473 for (block = 0; block < blocks; block++) {
1463 bh = ext4_bread(handle, dir, block, 0, &retval); 1474 bh = ext4_bread(handle, dir, block, 0, &retval);
1464 if(!bh) 1475 if(!bh)
1465 return retval; 1476 return retval;
@@ -1574,7 +1585,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1574 dxtrace(dx_show_index("node", frames[1].entries)); 1585 dxtrace(dx_show_index("node", frames[1].entries));
1575 dxtrace(dx_show_index("node", 1586 dxtrace(dx_show_index("node",
1576 ((struct dx_node *) bh2->b_data)->entries)); 1587 ((struct dx_node *) bh2->b_data)->entries));
1577 err = ext4_journal_dirty_metadata(handle, bh2); 1588 err = ext4_handle_dirty_metadata(handle, inode, bh2);
1578 if (err) 1589 if (err)
1579 goto journal_error; 1590 goto journal_error;
1580 brelse (bh2); 1591 brelse (bh2);
@@ -1600,7 +1611,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1600 if (err) 1611 if (err)
1601 goto journal_error; 1612 goto journal_error;
1602 } 1613 }
1603 ext4_journal_dirty_metadata(handle, frames[0].bh); 1614 ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1604 } 1615 }
1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1616 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1606 if (!de) 1617 if (!de)
@@ -1646,8 +1657,8 @@ static int ext4_delete_entry(handle_t *handle,
1646 else 1657 else
1647 de->inode = 0; 1658 de->inode = 0;
1648 dir->i_version++; 1659 dir->i_version++;
1649 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1660 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1650 ext4_journal_dirty_metadata(handle, bh); 1661 ext4_handle_dirty_metadata(handle, dir, bh);
1651 return 0; 1662 return 0;
1652 } 1663 }
1653 i += ext4_rec_len_from_disk(de->rec_len); 1664 i += ext4_rec_len_from_disk(de->rec_len);
@@ -1693,9 +1704,11 @@ static int ext4_add_nondir(handle_t *handle,
1693 if (!err) { 1704 if (!err) {
1694 ext4_mark_inode_dirty(handle, inode); 1705 ext4_mark_inode_dirty(handle, inode);
1695 d_instantiate(dentry, inode); 1706 d_instantiate(dentry, inode);
1707 unlock_new_inode(inode);
1696 return 0; 1708 return 0;
1697 } 1709 }
1698 drop_nlink(inode); 1710 drop_nlink(inode);
1711 unlock_new_inode(inode);
1699 iput(inode); 1712 iput(inode);
1700 return err; 1713 return err;
1701} 1714}
@@ -1723,7 +1736,7 @@ retry:
1723 return PTR_ERR(handle); 1736 return PTR_ERR(handle);
1724 1737
1725 if (IS_DIRSYNC(dir)) 1738 if (IS_DIRSYNC(dir))
1726 handle->h_sync = 1; 1739 ext4_handle_sync(handle);
1727 1740
1728 inode = ext4_new_inode (handle, dir, mode); 1741 inode = ext4_new_inode (handle, dir, mode);
1729 err = PTR_ERR(inode); 1742 err = PTR_ERR(inode);
@@ -1757,7 +1770,7 @@ retry:
1757 return PTR_ERR(handle); 1770 return PTR_ERR(handle);
1758 1771
1759 if (IS_DIRSYNC(dir)) 1772 if (IS_DIRSYNC(dir))
1760 handle->h_sync = 1; 1773 ext4_handle_sync(handle);
1761 1774
1762 inode = ext4_new_inode(handle, dir, mode); 1775 inode = ext4_new_inode(handle, dir, mode);
1763 err = PTR_ERR(inode); 1776 err = PTR_ERR(inode);
@@ -1793,7 +1806,7 @@ retry:
1793 return PTR_ERR(handle); 1806 return PTR_ERR(handle);
1794 1807
1795 if (IS_DIRSYNC(dir)) 1808 if (IS_DIRSYNC(dir))
1796 handle->h_sync = 1; 1809 ext4_handle_sync(handle);
1797 1810
1798 inode = ext4_new_inode(handle, dir, S_IFDIR | mode); 1811 inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
1799 err = PTR_ERR(inode); 1812 err = PTR_ERR(inode);
@@ -1822,14 +1835,15 @@ retry:
1822 strcpy(de->name, ".."); 1835 strcpy(de->name, "..");
1823 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1836 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1824 inode->i_nlink = 2; 1837 inode->i_nlink = 2;
1825 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); 1838 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1826 ext4_journal_dirty_metadata(handle, dir_block); 1839 ext4_handle_dirty_metadata(handle, dir, dir_block);
1827 brelse(dir_block); 1840 brelse(dir_block);
1828 ext4_mark_inode_dirty(handle, inode); 1841 ext4_mark_inode_dirty(handle, inode);
1829 err = ext4_add_entry(handle, dentry, inode); 1842 err = ext4_add_entry(handle, dentry, inode);
1830 if (err) { 1843 if (err) {
1831out_clear_inode: 1844out_clear_inode:
1832 clear_nlink(inode); 1845 clear_nlink(inode);
1846 unlock_new_inode(inode);
1833 ext4_mark_inode_dirty(handle, inode); 1847 ext4_mark_inode_dirty(handle, inode);
1834 iput(inode); 1848 iput(inode);
1835 goto out_stop; 1849 goto out_stop;
@@ -1838,6 +1852,7 @@ out_clear_inode:
1838 ext4_update_dx_flag(dir); 1852 ext4_update_dx_flag(dir);
1839 ext4_mark_inode_dirty(handle, dir); 1853 ext4_mark_inode_dirty(handle, dir);
1840 d_instantiate(dentry, inode); 1854 d_instantiate(dentry, inode);
1855 unlock_new_inode(inode);
1841out_stop: 1856out_stop:
1842 ext4_journal_stop(handle); 1857 ext4_journal_stop(handle);
1843 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 1858 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
@@ -1850,7 +1865,7 @@ out_stop:
1850 */ 1865 */
1851static int empty_dir(struct inode *inode) 1866static int empty_dir(struct inode *inode)
1852{ 1867{
1853 unsigned long offset; 1868 unsigned int offset;
1854 struct buffer_head *bh; 1869 struct buffer_head *bh;
1855 struct ext4_dir_entry_2 *de, *de1; 1870 struct ext4_dir_entry_2 *de, *de1;
1856 struct super_block *sb; 1871 struct super_block *sb;
@@ -1895,7 +1910,7 @@ static int empty_dir(struct inode *inode)
1895 if (err) 1910 if (err)
1896 ext4_error(sb, __func__, 1911 ext4_error(sb, __func__,
1897 "error %d reading directory" 1912 "error %d reading directory"
1898 " #%lu offset %lu", 1913 " #%lu offset %u",
1899 err, inode->i_ino, offset); 1914 err, inode->i_ino, offset);
1900 offset += sb->s_blocksize; 1915 offset += sb->s_blocksize;
1901 continue; 1916 continue;
@@ -1933,6 +1948,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1933 struct ext4_iloc iloc; 1948 struct ext4_iloc iloc;
1934 int err = 0, rc; 1949 int err = 0, rc;
1935 1950
1951 if (!ext4_handle_valid(handle))
1952 return 0;
1953
1936 lock_super(sb); 1954 lock_super(sb);
1937 if (!list_empty(&EXT4_I(inode)->i_orphan)) 1955 if (!list_empty(&EXT4_I(inode)->i_orphan))
1938 goto out_unlock; 1956 goto out_unlock;
@@ -1961,7 +1979,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1961 /* Insert this inode at the head of the on-disk orphan list... */ 1979 /* Insert this inode at the head of the on-disk orphan list... */
1962 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 1980 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
1963 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 1981 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1964 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1982 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
1965 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 1983 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
1966 if (!err) 1984 if (!err)
1967 err = rc; 1985 err = rc;
@@ -1995,10 +2013,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
1995 struct list_head *prev; 2013 struct list_head *prev;
1996 struct ext4_inode_info *ei = EXT4_I(inode); 2014 struct ext4_inode_info *ei = EXT4_I(inode);
1997 struct ext4_sb_info *sbi; 2015 struct ext4_sb_info *sbi;
1998 unsigned long ino_next; 2016 __u32 ino_next;
1999 struct ext4_iloc iloc; 2017 struct ext4_iloc iloc;
2000 int err = 0; 2018 int err = 0;
2001 2019
2020 if (!ext4_handle_valid(handle))
2021 return 0;
2022
2002 lock_super(inode->i_sb); 2023 lock_super(inode->i_sb);
2003 if (list_empty(&ei->i_orphan)) { 2024 if (list_empty(&ei->i_orphan)) {
2004 unlock_super(inode->i_sb); 2025 unlock_super(inode->i_sb);
@@ -2017,7 +2038,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2017 * transaction handle with which to update the orphan list on 2038 * transaction handle with which to update the orphan list on
2018 * disk, but we still need to remove the inode from the linked 2039 * disk, but we still need to remove the inode from the linked
2019 * list in memory. */ 2040 * list in memory. */
2020 if (!handle) 2041 if (sbi->s_journal && !handle)
2021 goto out; 2042 goto out;
2022 2043
2023 err = ext4_reserve_inode_write(handle, inode, &iloc); 2044 err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2025,19 +2046,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2025 goto out_err; 2046 goto out_err;
2026 2047
2027 if (prev == &sbi->s_orphan) { 2048 if (prev == &sbi->s_orphan) {
2028 jbd_debug(4, "superblock will point to %lu\n", ino_next); 2049 jbd_debug(4, "superblock will point to %u\n", ino_next);
2029 BUFFER_TRACE(sbi->s_sbh, "get_write_access"); 2050 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2030 err = ext4_journal_get_write_access(handle, sbi->s_sbh); 2051 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
2031 if (err) 2052 if (err)
2032 goto out_brelse; 2053 goto out_brelse;
2033 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2054 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2034 err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); 2055 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
2035 } else { 2056 } else {
2036 struct ext4_iloc iloc2; 2057 struct ext4_iloc iloc2;
2037 struct inode *i_prev = 2058 struct inode *i_prev =
2038 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; 2059 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
2039 2060
2040 jbd_debug(4, "orphan inode %lu will point to %lu\n", 2061 jbd_debug(4, "orphan inode %lu will point to %u\n",
2041 i_prev->i_ino, ino_next); 2062 i_prev->i_ino, ino_next);
2042 err = ext4_reserve_inode_write(handle, i_prev, &iloc2); 2063 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
2043 if (err) 2064 if (err)
@@ -2082,7 +2103,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2082 goto end_rmdir; 2103 goto end_rmdir;
2083 2104
2084 if (IS_DIRSYNC(dir)) 2105 if (IS_DIRSYNC(dir))
2085 handle->h_sync = 1; 2106 ext4_handle_sync(handle);
2086 2107
2087 inode = dentry->d_inode; 2108 inode = dentry->d_inode;
2088 2109
@@ -2136,7 +2157,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2136 return PTR_ERR(handle); 2157 return PTR_ERR(handle);
2137 2158
2138 if (IS_DIRSYNC(dir)) 2159 if (IS_DIRSYNC(dir))
2139 handle->h_sync = 1; 2160 ext4_handle_sync(handle);
2140 2161
2141 retval = -ENOENT; 2162 retval = -ENOENT;
2142 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2163 bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2193,7 +2214,7 @@ retry:
2193 return PTR_ERR(handle); 2214 return PTR_ERR(handle);
2194 2215
2195 if (IS_DIRSYNC(dir)) 2216 if (IS_DIRSYNC(dir))
2196 handle->h_sync = 1; 2217 ext4_handle_sync(handle);
2197 2218
2198 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); 2219 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
2199 err = PTR_ERR(inode); 2220 err = PTR_ERR(inode);
@@ -2208,10 +2229,10 @@ retry:
2208 * We have a transaction open. All is sweetness. It also sets 2229 * We have a transaction open. All is sweetness. It also sets
2209 * i_size in generic_commit_write(). 2230 * i_size in generic_commit_write().
2210 */ 2231 */
2211 err = __page_symlink(inode, symname, l, 2232 err = __page_symlink(inode, symname, l, 1);
2212 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2213 if (err) { 2233 if (err) {
2214 clear_nlink(inode); 2234 clear_nlink(inode);
2235 unlock_new_inode(inode);
2215 ext4_mark_inode_dirty(handle, inode); 2236 ext4_mark_inode_dirty(handle, inode);
2216 iput(inode); 2237 iput(inode);
2217 goto out_stop; 2238 goto out_stop;
@@ -2256,13 +2277,20 @@ retry:
2256 return PTR_ERR(handle); 2277 return PTR_ERR(handle);
2257 2278
2258 if (IS_DIRSYNC(dir)) 2279 if (IS_DIRSYNC(dir))
2259 handle->h_sync = 1; 2280 ext4_handle_sync(handle);
2260 2281
2261 inode->i_ctime = ext4_current_time(inode); 2282 inode->i_ctime = ext4_current_time(inode);
2262 ext4_inc_count(handle, inode); 2283 ext4_inc_count(handle, inode);
2263 atomic_inc(&inode->i_count); 2284 atomic_inc(&inode->i_count);
2264 2285
2265 err = ext4_add_nondir(handle, dentry, inode); 2286 err = ext4_add_entry(handle, dentry, inode);
2287 if (!err) {
2288 ext4_mark_inode_dirty(handle, inode);
2289 d_instantiate(dentry, inode);
2290 } else {
2291 drop_nlink(inode);
2292 iput(inode);
2293 }
2266 ext4_journal_stop(handle); 2294 ext4_journal_stop(handle);
2267 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2295 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2268 goto retry; 2296 goto retry;
@@ -2298,7 +2326,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2298 return PTR_ERR(handle); 2326 return PTR_ERR(handle);
2299 2327
2300 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 2328 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2301 handle->h_sync = 1; 2329 ext4_handle_sync(handle);
2302 2330
2303 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); 2331 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2304 /* 2332 /*
@@ -2352,8 +2380,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2352 new_dir->i_ctime = new_dir->i_mtime = 2380 new_dir->i_ctime = new_dir->i_mtime =
2353 ext4_current_time(new_dir); 2381 ext4_current_time(new_dir);
2354 ext4_mark_inode_dirty(handle, new_dir); 2382 ext4_mark_inode_dirty(handle, new_dir);
2355 BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); 2383 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2356 ext4_journal_dirty_metadata(handle, new_bh); 2384 ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2357 brelse(new_bh); 2385 brelse(new_bh);
2358 new_bh = NULL; 2386 new_bh = NULL;
2359 } 2387 }
@@ -2403,8 +2431,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2403 BUFFER_TRACE(dir_bh, "get_write_access"); 2431 BUFFER_TRACE(dir_bh, "get_write_access");
2404 ext4_journal_get_write_access(handle, dir_bh); 2432 ext4_journal_get_write_access(handle, dir_bh);
2405 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2433 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2406 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); 2434 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2407 ext4_journal_dirty_metadata(handle, dir_bh); 2435 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2408 ext4_dec_count(handle, old_dir); 2436 ext4_dec_count(handle, old_dir);
2409 if (new_inode) { 2437 if (new_inode) {
2410 /* checked empty_dir above, can't have another parent, 2438 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6ec1843a015..c06886abd658 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
50 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 50 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
51 if (group != sbi->s_groups_count) 51 if (group != sbi->s_groups_count)
52 ext4_warning(sb, __func__, 52 ext4_warning(sb, __func__,
53 "Cannot add at group %u (only %lu groups)", 53 "Cannot add at group %u (only %u groups)",
54 input->group, sbi->s_groups_count); 54 input->group, sbi->s_groups_count);
55 else if (offset != 0) 55 else if (offset != 0)
56 ext4_warning(sb, __func__, "Last group not full"); 56 ext4_warning(sb, __func__, "Last group not full");
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
149{ 149{
150 int err; 150 int err;
151 151
152 if (handle->h_buffer_credits >= thresh) 152 if (ext4_handle_has_enough_credits(handle, thresh))
153 return 0; 153 return 0;
154 154
155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); 155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
233 set_buffer_uptodate(gdb); 233 set_buffer_uptodate(gdb);
234 unlock_buffer(gdb); 234 unlock_buffer(gdb);
235 ext4_journal_dirty_metadata(handle, gdb); 235 ext4_handle_dirty_metadata(handle, NULL, gdb);
236 ext4_set_bit(bit, bh->b_data); 236 ext4_set_bit(bit, bh->b_data);
237 brelse(gdb); 237 brelse(gdb);
238 } 238 }
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
251 err = PTR_ERR(bh); 251 err = PTR_ERR(bh);
252 goto exit_bh; 252 goto exit_bh;
253 } 253 }
254 ext4_journal_dirty_metadata(handle, gdb); 254 ext4_handle_dirty_metadata(handle, NULL, gdb);
255 ext4_set_bit(bit, bh->b_data); 255 ext4_set_bit(bit, bh->b_data);
256 brelse(gdb); 256 brelse(gdb);
257 } 257 }
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
276 err = PTR_ERR(it); 276 err = PTR_ERR(it);
277 goto exit_bh; 277 goto exit_bh;
278 } 278 }
279 ext4_journal_dirty_metadata(handle, it); 279 ext4_handle_dirty_metadata(handle, NULL, it);
280 brelse(it); 280 brelse(it);
281 ext4_set_bit(bit, bh->b_data); 281 ext4_set_bit(bit, bh->b_data);
282 } 282 }
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
284 if ((err = extend_or_restart_transaction(handle, 2, bh))) 284 if ((err = extend_or_restart_transaction(handle, 2, bh)))
285 goto exit_bh; 285 goto exit_bh;
286 286
287 mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), 287 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
288 bh->b_data); 288 ext4_handle_dirty_metadata(handle, NULL, bh);
289 ext4_journal_dirty_metadata(handle, bh);
290 brelse(bh); 289 brelse(bh);
291
292 /* Mark unused entries in inode bitmap used */ 290 /* Mark unused entries in inode bitmap used */
293 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 291 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
294 input->inode_bitmap, input->inode_bitmap - start); 292 input->inode_bitmap, input->inode_bitmap - start);
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb,
297 goto exit_journal; 295 goto exit_journal;
298 } 296 }
299 297
300 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 298 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
301 bh->b_data); 299 bh->b_data);
302 ext4_journal_dirty_metadata(handle, bh); 300 ext4_handle_dirty_metadata(handle, NULL, bh);
303exit_bh: 301exit_bh:
304 brelse(bh); 302 brelse(bh);
305 303
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
486 * reserved inode, and will become GDT blocks (primary and backup). 484 * reserved inode, and will become GDT blocks (primary and backup).
487 */ 485 */
488 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 486 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
489 ext4_journal_dirty_metadata(handle, dind); 487 ext4_handle_dirty_metadata(handle, NULL, dind);
490 brelse(dind); 488 brelse(dind);
491 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 489 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
492 ext4_mark_iloc_dirty(handle, inode, &iloc); 490 ext4_mark_iloc_dirty(handle, inode, &iloc);
493 memset((*primary)->b_data, 0, sb->s_blocksize); 491 memset((*primary)->b_data, 0, sb->s_blocksize);
494 ext4_journal_dirty_metadata(handle, *primary); 492 ext4_handle_dirty_metadata(handle, NULL, *primary);
495 493
496 o_group_desc = EXT4_SB(sb)->s_group_desc; 494 o_group_desc = EXT4_SB(sb)->s_group_desc;
497 memcpy(n_group_desc, o_group_desc, 495 memcpy(n_group_desc, o_group_desc,
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
502 kfree(o_group_desc); 500 kfree(o_group_desc);
503 501
504 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 502 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
505 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 503 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
506 504
507 return 0; 505 return 0;
508 506
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
618 primary[i]->b_blocknr, gdbackups, 616 primary[i]->b_blocknr, gdbackups,
619 blk + primary[i]->b_blocknr); */ 617 blk + primary[i]->b_blocknr); */
620 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); 618 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
621 err2 = ext4_journal_dirty_metadata(handle, primary[i]); 619 err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
622 if (!err) 620 if (!err)
623 err = err2; 621 err = err2;
624 } 622 }
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb,
676 struct buffer_head *bh; 674 struct buffer_head *bh;
677 675
678 /* Out of journal space, and can't get more - abort - so sad */ 676 /* Out of journal space, and can't get more - abort - so sad */
679 if (handle->h_buffer_credits == 0 && 677 if (ext4_handle_valid(handle) &&
678 handle->h_buffer_credits == 0 &&
680 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && 679 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
681 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 680 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
682 break; 681 break;
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb,
696 memset(bh->b_data + size, 0, rest); 695 memset(bh->b_data + size, 0, rest);
697 set_buffer_uptodate(bh); 696 set_buffer_uptodate(bh);
698 unlock_buffer(bh); 697 unlock_buffer(bh);
699 ext4_journal_dirty_metadata(handle, bh); 698 ext4_handle_dirty_metadata(handle, NULL, bh);
700 brelse(bh); 699 brelse(bh);
701 } 700 }
702 if ((err2 = ext4_journal_stop(handle)) && !err) 701 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb,
715exit_err: 714exit_err:
716 if (err) { 715 if (err) {
717 ext4_warning(sb, __func__, 716 ext4_warning(sb, __func__,
718 "can't update backup for group %lu (err %d), " 717 "can't update backup for group %u (err %d), "
719 "forcing fsck on next reboot", group, err); 718 "forcing fsck on next reboot", group, err);
720 sbi->s_mount_state &= ~EXT4_VALID_FS; 719 sbi->s_mount_state &= ~EXT4_VALID_FS;
721 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
747 struct inode *inode = NULL; 746 struct inode *inode = NULL;
748 handle_t *handle; 747 handle_t *handle;
749 int gdb_off, gdb_num; 748 int gdb_off, gdb_num;
749 int num_grp_locked = 0;
750 int err, err2; 750 int err, err2;
751 751
752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
761 761
762 if (ext4_blocks_count(es) + input->blocks_count < 762 if (ext4_blocks_count(es) + input->blocks_count <
763 ext4_blocks_count(es)) { 763 ext4_blocks_count(es)) {
764 ext4_warning(sb, __func__, "blocks_count overflow\n"); 764 ext4_warning(sb, __func__, "blocks_count overflow");
765 return -EINVAL; 765 return -EINVAL;
766 } 766 }
767 767
768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
769 le32_to_cpu(es->s_inodes_count)) { 769 le32_to_cpu(es->s_inodes_count)) {
770 ext4_warning(sb, __func__, "inodes_count overflow\n"); 770 ext4_warning(sb, __func__, "inodes_count overflow");
771 return -EINVAL; 771 return -EINVAL;
772 } 772 }
773 773
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
787 } 787 }
788 } 788 }
789 789
790
790 if ((err = verify_group_input(sb, input))) 791 if ((err = verify_group_input(sb, input)))
791 goto exit_put; 792 goto exit_put;
792 793
@@ -855,24 +856,29 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
855 * using the new disk blocks. 856 * using the new disk blocks.
856 */ 857 */
857 858
859 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
858 /* Update group descriptor block for new group */ 860 /* Update group descriptor block for new group */
859 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 861 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
860 gdb_off * EXT4_DESC_SIZE(sb)); 862 gdb_off * EXT4_DESC_SIZE(sb));
861 863
864 memset(gdp, 0, EXT4_DESC_SIZE(sb));
862 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 865 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
863 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 866 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
864 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 867 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
865 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); 868 ext4_free_blks_set(sb, gdp, input->free_blocks_count);
866 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); 869 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
870 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
867 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 871 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
868 872
869 /* 873 /*
870 * We can allocate memory for mb_alloc based on the new group 874 * We can allocate memory for mb_alloc based on the new group
871 * descriptor 875 * descriptor
872 */ 876 */
873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); 877 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
874 if (err) 878 if (err) {
879 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
875 goto exit_journal; 880 goto exit_journal;
881 }
876 882
877 /* 883 /*
878 * Make the new blocks and inodes valid next. We do this before 884 * Make the new blocks and inodes valid next. We do this before
@@ -914,8 +920,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
914 920
915 /* Update the global fs size fields */ 921 /* Update the global fs size fields */
916 sbi->s_groups_count++; 922 sbi->s_groups_count++;
923 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
917 924
918 ext4_journal_dirty_metadata(handle, primary); 925 ext4_handle_dirty_metadata(handle, NULL, primary);
919 926
920 /* Update the reserved block counts only once the new group is 927 /* Update the reserved block counts only once the new group is
921 * active. */ 928 * active. */
@@ -937,7 +944,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 EXT4_INODES_PER_GROUP(sb); 944 EXT4_INODES_PER_GROUP(sb);
938 } 945 }
939 946
940 ext4_journal_dirty_metadata(handle, sbi->s_sbh); 947 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
941 sb->s_dirt = 1; 948 sb->s_dirt = 1;
942 949
943exit_journal: 950exit_journal:
@@ -975,9 +982,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
975 struct buffer_head *bh; 982 struct buffer_head *bh;
976 handle_t *handle; 983 handle_t *handle;
977 int err; 984 int err;
978 unsigned long freed_blocks;
979 ext4_group_t group; 985 ext4_group_t group;
980 struct ext4_group_info *grp;
981 986
982 /* We don't need to worry about locking wrt other resizers just 987 /* We don't need to worry about locking wrt other resizers just
983 * yet: we're going to revalidate es->s_blocks_count after 988 * yet: we're going to revalidate es->s_blocks_count after
@@ -997,8 +1002,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 1002 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 1003 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 1004 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, 1005 ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
1001 "CONFIG_LBD not enabled\n");
1002 return -EINVAL; 1006 return -EINVAL;
1003 } 1007 }
1004 1008
@@ -1071,62 +1075,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1071 goto exit_put; 1075 goto exit_put;
1072 } 1076 }
1073 ext4_blocks_count_set(es, o_blocks_count + add); 1077 ext4_blocks_count_set(es, o_blocks_count + add);
1074 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1078 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1075 sb->s_dirt = 1; 1079 sb->s_dirt = 1;
1076 unlock_super(sb); 1080 unlock_super(sb);
1077 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1081 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1078 o_blocks_count + add); 1082 o_blocks_count + add);
1079 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1083 /* We add the blocks to the bitmap and set the group need init bit */
1084 ext4_add_groupblocks(handle, sb, o_blocks_count, add);
1080 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1085 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1081 o_blocks_count + add); 1086 o_blocks_count + add);
1082 if ((err = ext4_journal_stop(handle))) 1087 if ((err = ext4_journal_stop(handle)))
1083 goto exit_put; 1088 goto exit_put;
1084 1089
1085 /*
1086 * Mark mballoc pages as not up to date so that they will be updated
1087 * next time they are loaded by ext4_mb_load_buddy.
1088 *
1089 * XXX Bad, Bad, BAD!!! We should not be overloading the
1090 * Uptodate flag, particularly on thte bitmap bh, as way of
1091 * hinting to ext4_mb_load_buddy() that it needs to be
1092 * overloaded. A user could take a LVM snapshot, then do an
1093 * on-line fsck, and clear the uptodate flag, and this would
1094 * not be a bug in userspace, but a bug in the kernel. FIXME!!!
1095 */
1096 {
1097 struct ext4_sb_info *sbi = EXT4_SB(sb);
1098 struct inode *inode = sbi->s_buddy_cache;
1099 int blocks_per_page;
1100 int block;
1101 int pnum;
1102 struct page *page;
1103
1104 /* Set buddy page as not up to date */
1105 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1106 block = group * 2;
1107 pnum = block / blocks_per_page;
1108 page = find_get_page(inode->i_mapping, pnum);
1109 if (page != NULL) {
1110 ClearPageUptodate(page);
1111 page_cache_release(page);
1112 }
1113
1114 /* Set bitmap page as not up to date */
1115 block++;
1116 pnum = block / blocks_per_page;
1117 page = find_get_page(inode->i_mapping, pnum);
1118 if (page != NULL) {
1119 ClearPageUptodate(page);
1120 page_cache_release(page);
1121 }
1122
1123 /* Get the info on the last group */
1124 grp = ext4_get_group_info(sb, group);
1125
1126 /* Update free blocks in group info */
1127 ext4_mb_update_group_info(grp, add);
1128 }
1129
1130 if (test_opt(sb, DEBUG)) 1090 if (test_opt(sb, DEBUG))
1131 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1091 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1132 ext4_blocks_count(es)); 1092 ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04158ad74dbb..e5f06a5f045e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,9 +51,7 @@ struct proc_dir_entry *ext4_proc_root;
51 51
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 52static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 53 unsigned long journal_devnum);
54static int ext4_create_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_commit_super(struct super_block *sb,
55 unsigned int);
56static void ext4_commit_super(struct super_block *sb,
57 struct ext4_super_block *es, int sync); 55 struct ext4_super_block *es, int sync);
58static void ext4_mark_recovery_complete(struct super_block *sb, 56static void ext4_mark_recovery_complete(struct super_block *sb,
59 struct ext4_super_block *es); 57 struct ext4_super_block *es);
@@ -64,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
64 char nbuf[16]); 62 char nbuf[16]);
65static int ext4_remount(struct super_block *sb, int *flags, char *data); 63static int ext4_remount(struct super_block *sb, int *flags, char *data);
66static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 64static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
67static void ext4_unlockfs(struct super_block *sb); 65static int ext4_unfreeze(struct super_block *sb);
68static void ext4_write_super(struct super_block *sb); 66static void ext4_write_super(struct super_block *sb);
69static void ext4_write_super_lockfs(struct super_block *sb); 67static int ext4_freeze(struct super_block *sb);
70 68
71 69
72ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 70ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
93 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 91 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
94} 92}
95 93
94__u32 ext4_free_blks_count(struct super_block *sb,
95 struct ext4_group_desc *bg)
96{
97 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
98 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
99 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
100}
101
102__u32 ext4_free_inodes_count(struct super_block *sb,
103 struct ext4_group_desc *bg)
104{
105 return le16_to_cpu(bg->bg_free_inodes_count_lo) |
106 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
107 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
108}
109
110__u32 ext4_used_dirs_count(struct super_block *sb,
111 struct ext4_group_desc *bg)
112{
113 return le16_to_cpu(bg->bg_used_dirs_count_lo) |
114 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
115 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
116}
117
118__u32 ext4_itable_unused_count(struct super_block *sb,
119 struct ext4_group_desc *bg)
120{
121 return le16_to_cpu(bg->bg_itable_unused_lo) |
122 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
123 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
124}
125
96void ext4_block_bitmap_set(struct super_block *sb, 126void ext4_block_bitmap_set(struct super_block *sb,
97 struct ext4_group_desc *bg, ext4_fsblk_t blk) 127 struct ext4_group_desc *bg, ext4_fsblk_t blk)
98{ 128{
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb,
117 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); 147 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
118} 148}
119 149
150void ext4_free_blks_set(struct super_block *sb,
151 struct ext4_group_desc *bg, __u32 count)
152{
153 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
154 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
155 bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
156}
157
158void ext4_free_inodes_set(struct super_block *sb,
159 struct ext4_group_desc *bg, __u32 count)
160{
161 bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
162 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
163 bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
164}
165
166void ext4_used_dirs_set(struct super_block *sb,
167 struct ext4_group_desc *bg, __u32 count)
168{
169 bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
170 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
171 bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
172}
173
174void ext4_itable_unused_set(struct super_block *sb,
175 struct ext4_group_desc *bg, __u32 count)
176{
177 bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
178 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
179 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
180}
181
120/* 182/*
121 * Wrappers for jbd2_journal_start/end. 183 * Wrappers for jbd2_journal_start/end.
122 * 184 *
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
136 * backs (eg. EIO in the commit thread), then we still need to 198 * backs (eg. EIO in the commit thread), then we still need to
137 * take the FS itself readonly cleanly. */ 199 * take the FS itself readonly cleanly. */
138 journal = EXT4_SB(sb)->s_journal; 200 journal = EXT4_SB(sb)->s_journal;
139 if (is_journal_aborted(journal)) { 201 if (journal) {
140 ext4_abort(sb, __func__, 202 if (is_journal_aborted(journal)) {
141 "Detected aborted journal"); 203 ext4_abort(sb, __func__,
142 return ERR_PTR(-EROFS); 204 "Detected aborted journal");
205 return ERR_PTR(-EROFS);
206 }
207 return jbd2_journal_start(journal, nblocks);
143 } 208 }
144 209 /*
145 return jbd2_journal_start(journal, nblocks); 210 * We're not journaling, return the appropriate indication.
211 */
212 current->journal_info = EXT4_NOJOURNAL_HANDLE;
213 return current->journal_info;
146} 214}
147 215
148/* 216/*
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
157 int err; 225 int err;
158 int rc; 226 int rc;
159 227
228 if (!ext4_handle_valid(handle)) {
229 /*
230 * Do this here since we don't call jbd2_journal_stop() in
231 * no-journal mode.
232 */
233 current->journal_info = NULL;
234 return 0;
235 }
160 sb = handle->h_transaction->t_journal->j_private; 236 sb = handle->h_transaction->t_journal->j_private;
161 err = handle->h_err; 237 err = handle->h_err;
162 rc = jbd2_journal_stop(handle); 238 rc = jbd2_journal_stop(handle);
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
174 char nbuf[16]; 250 char nbuf[16];
175 const char *errstr = ext4_decode_error(NULL, err, nbuf); 251 const char *errstr = ext4_decode_error(NULL, err, nbuf);
176 252
253 BUG_ON(!ext4_handle_valid(handle));
254
177 if (bh) 255 if (bh)
178 BUFFER_TRACE(bh, "abort"); 256 BUFFER_TRACE(bh, "abort");
179 257
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function,
350 va_end(args); 428 va_end(args);
351} 429}
352 430
431void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
432 const char *function, const char *fmt, ...)
433__releases(bitlock)
434__acquires(bitlock)
435{
436 va_list args;
437 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
438
439 va_start(args, fmt);
440 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
441 vprintk(fmt, args);
442 printk("\n");
443 va_end(args);
444
445 if (test_opt(sb, ERRORS_CONT)) {
446 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
447 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
448 ext4_commit_super(sb, es, 0);
449 return;
450 }
451 ext4_unlock_group(sb, grp);
452 ext4_handle_error(sb);
453 /*
454 * We only get here in the ERRORS_RO case; relocking the group
455 * may be dangerous, but nothing bad will happen since the
456 * filesystem will have already been marked read/only and the
457 * journal has been aborted. We return 1 as a hint to callers
458 * who might want to use the return value from
459 * ext4_grp_locked_error() to distinguish between the
460 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
461 * aggressively from the ext4 function in question, with a
462 * more appropriate error code.
463 */
464 ext4_lock_group(sb, grp);
465 return;
466}
467
468
353void ext4_update_dynamic_rev(struct super_block *sb) 469void ext4_update_dynamic_rev(struct super_block *sb)
354{ 470{
355 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 471 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
389 return bdev; 505 return bdev;
390 506
391fail: 507fail:
392 printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", 508 printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
393 __bdevname(dev, b), PTR_ERR(bdev)); 509 __bdevname(dev, b), PTR_ERR(bdev));
394 return NULL; 510 return NULL;
395} 511}
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb)
448 ext4_mb_release(sb); 564 ext4_mb_release(sb);
449 ext4_ext_release(sb); 565 ext4_ext_release(sb);
450 ext4_xattr_put_super(sb); 566 ext4_xattr_put_super(sb);
451 err = jbd2_journal_destroy(sbi->s_journal); 567 if (sbi->s_journal) {
452 sbi->s_journal = NULL; 568 err = jbd2_journal_destroy(sbi->s_journal);
453 if (err < 0) 569 sbi->s_journal = NULL;
454 ext4_abort(sb, __func__, "Couldn't clean up the journal"); 570 if (err < 0)
455 571 ext4_abort(sb, __func__,
572 "Couldn't clean up the journal");
573 }
456 if (!(sb->s_flags & MS_RDONLY)) { 574 if (!(sb->s_flags & MS_RDONLY)) {
457 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 575 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
458 es->s_state = cpu_to_le16(sbi->s_mount_state); 576 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
522 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 640 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
523 INIT_LIST_HEAD(&ei->i_prealloc_list); 641 INIT_LIST_HEAD(&ei->i_prealloc_list);
524 spin_lock_init(&ei->i_prealloc_lock); 642 spin_lock_init(&ei->i_prealloc_lock);
643 /*
644 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
645 * therefore it can be null here. Don't check it, just initialize
646 * jinode.
647 */
525 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); 648 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
526 ei->i_reserved_data_blocks = 0; 649 ei->i_reserved_data_blocks = 0;
527 ei->i_reserved_meta_blocks = 0; 650 ei->i_reserved_meta_blocks = 0;
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode)
588 } 711 }
589#endif 712#endif
590 ext4_discard_preallocations(inode); 713 ext4_discard_preallocations(inode);
591 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 714 if (EXT4_JOURNAL(inode))
715 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
592 &EXT4_I(inode)->jinode); 716 &EXT4_I(inode)->jinode);
593} 717}
594 718
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
681#endif 805#endif
682 if (!test_opt(sb, RESERVATION)) 806 if (!test_opt(sb, RESERVATION))
683 seq_puts(seq, ",noreservation"); 807 seq_puts(seq, ",noreservation");
684 if (sbi->s_commit_interval) { 808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
685 seq_printf(seq, ",commit=%u", 809 seq_printf(seq, ",commit=%u",
686 (unsigned) (sbi->s_commit_interval / HZ)); 810 (unsigned) (sbi->s_commit_interval / HZ));
687 } 811 }
812 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
813 seq_printf(seq, ",min_batch_time=%u",
814 (unsigned) sbi->s_min_batch_time);
815 }
816 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
817 seq_printf(seq, ",max_batch_time=%u",
818 (unsigned) sbi->s_min_batch_time);
819 }
820
688 /* 821 /*
689 * We're changing the default of barrier mount option, so 822 * We're changing the default of barrier mount option, so
690 * let's always display its mount state so it's clear what its 823 * let's always display its mount state so it's clear what its
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
696 seq_puts(seq, ",journal_async_commit"); 829 seq_puts(seq, ",journal_async_commit");
697 if (test_opt(sb, NOBH)) 830 if (test_opt(sb, NOBH))
698 seq_puts(seq, ",nobh"); 831 seq_puts(seq, ",nobh");
699 if (!test_opt(sb, EXTENTS))
700 seq_puts(seq, ",noextents");
701 if (test_opt(sb, I_VERSION)) 832 if (test_opt(sb, I_VERSION))
702 seq_puts(seq, ",i_version"); 833 seq_puts(seq, ",i_version");
703 if (!test_opt(sb, DELALLOC)) 834 if (!test_opt(sb, DELALLOC))
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
772 ext4_nfs_get_inode); 903 ext4_nfs_get_inode);
773} 904}
774 905
906/*
907 * Try to release metadata pages (indirect blocks, directories) which are
908 * mapped via the block device. Since these pages could have journal heads
909 * which would prevent try_to_free_buffers() from freeing them, we must use
910 * jbd2 layer's try_to_free_buffers() function to release them.
911 */
912static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
913{
914 journal_t *journal = EXT4_SB(sb)->s_journal;
915
916 WARN_ON(PageChecked(page));
917 if (!page_has_buffers(page))
918 return 0;
919 if (journal)
920 return jbd2_journal_try_to_free_buffers(journal, page,
921 wait & ~__GFP_WAIT);
922 return try_to_free_buffers(page);
923}
924
775#ifdef CONFIG_QUOTA 925#ifdef CONFIG_QUOTA
776#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") 926#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
777#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 927#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -803,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = {
803 .acquire_dquot = ext4_acquire_dquot, 953 .acquire_dquot = ext4_acquire_dquot,
804 .release_dquot = ext4_release_dquot, 954 .release_dquot = ext4_release_dquot,
805 .mark_dirty = ext4_mark_dquot_dirty, 955 .mark_dirty = ext4_mark_dquot_dirty,
806 .write_info = ext4_write_info 956 .write_info = ext4_write_info,
957 .alloc_dquot = dquot_alloc,
958 .destroy_dquot = dquot_destroy,
807}; 959};
808 960
809static struct quotactl_ops ext4_qctl_operations = { 961static struct quotactl_ops ext4_qctl_operations = {
@@ -826,8 +978,8 @@ static const struct super_operations ext4_sops = {
826 .put_super = ext4_put_super, 978 .put_super = ext4_put_super,
827 .write_super = ext4_write_super, 979 .write_super = ext4_write_super,
828 .sync_fs = ext4_sync_fs, 980 .sync_fs = ext4_sync_fs,
829 .write_super_lockfs = ext4_write_super_lockfs, 981 .freeze_fs = ext4_freeze,
830 .unlockfs = ext4_unlockfs, 982 .unfreeze_fs = ext4_unfreeze,
831 .statfs = ext4_statfs, 983 .statfs = ext4_statfs,
832 .remount_fs = ext4_remount, 984 .remount_fs = ext4_remount,
833 .clear_inode = ext4_clear_inode, 985 .clear_inode = ext4_clear_inode,
@@ -836,6 +988,7 @@ static const struct super_operations ext4_sops = {
836 .quota_read = ext4_quota_read, 988 .quota_read = ext4_quota_read,
837 .quota_write = ext4_quota_write, 989 .quota_write = ext4_quota_write,
838#endif 990#endif
991 .bdev_try_to_free_page = bdev_try_to_free_page,
839}; 992};
840 993
841static const struct export_operations ext4_export_ops = { 994static const struct export_operations ext4_export_ops = {
@@ -850,16 +1003,17 @@ enum {
850 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1003 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
851 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1004 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
852 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1005 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
853 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 1006 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1007 Opt_journal_update, Opt_journal_dev,
854 Opt_journal_checksum, Opt_journal_async_commit, 1008 Opt_journal_checksum, Opt_journal_async_commit,
855 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1009 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
856 Opt_data_err_abort, Opt_data_err_ignore, 1010 Opt_data_err_abort, Opt_data_err_ignore,
857 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1011 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
858 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1012 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
859 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1013 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
860 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 1014 Opt_grpquota, Opt_i_version,
861 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1015 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
862 Opt_inode_readahead_blks 1016 Opt_inode_readahead_blks, Opt_journal_ioprio
863}; 1017};
864 1018
865static const match_table_t tokens = { 1019static const match_table_t tokens = {
@@ -889,8 +1043,9 @@ static const match_table_t tokens = {
889 {Opt_nobh, "nobh"}, 1043 {Opt_nobh, "nobh"},
890 {Opt_bh, "bh"}, 1044 {Opt_bh, "bh"},
891 {Opt_commit, "commit=%u"}, 1045 {Opt_commit, "commit=%u"},
1046 {Opt_min_batch_time, "min_batch_time=%u"},
1047 {Opt_max_batch_time, "max_batch_time=%u"},
892 {Opt_journal_update, "journal=update"}, 1048 {Opt_journal_update, "journal=update"},
893 {Opt_journal_inum, "journal=%u"},
894 {Opt_journal_dev, "journal_dev=%u"}, 1049 {Opt_journal_dev, "journal_dev=%u"},
895 {Opt_journal_checksum, "journal_checksum"}, 1050 {Opt_journal_checksum, "journal_checksum"},
896 {Opt_journal_async_commit, "journal_async_commit"}, 1051 {Opt_journal_async_commit, "journal_async_commit"},
@@ -911,14 +1066,13 @@ static const match_table_t tokens = {
911 {Opt_quota, "quota"}, 1066 {Opt_quota, "quota"},
912 {Opt_usrquota, "usrquota"}, 1067 {Opt_usrquota, "usrquota"},
913 {Opt_barrier, "barrier=%u"}, 1068 {Opt_barrier, "barrier=%u"},
914 {Opt_extents, "extents"},
915 {Opt_noextents, "noextents"},
916 {Opt_i_version, "i_version"}, 1069 {Opt_i_version, "i_version"},
917 {Opt_stripe, "stripe=%u"}, 1070 {Opt_stripe, "stripe=%u"},
918 {Opt_resize, "resize"}, 1071 {Opt_resize, "resize"},
919 {Opt_delalloc, "delalloc"}, 1072 {Opt_delalloc, "delalloc"},
920 {Opt_nodelalloc, "nodelalloc"}, 1073 {Opt_nodelalloc, "nodelalloc"},
921 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1074 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1075 {Opt_journal_ioprio, "journal_ioprio=%u"},
922 {Opt_err, NULL}, 1076 {Opt_err, NULL},
923}; 1077};
924 1078
@@ -943,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data)
943 return sb_block; 1097 return sb_block;
944} 1098}
945 1099
1100#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1101
946static int parse_options(char *options, struct super_block *sb, 1102static int parse_options(char *options, struct super_block *sb,
947 unsigned int *inum, unsigned long *journal_devnum, 1103 unsigned long *journal_devnum,
1104 unsigned int *journal_ioprio,
948 ext4_fsblk_t *n_blocks_count, int is_remount) 1105 ext4_fsblk_t *n_blocks_count, int is_remount)
949{ 1106{
950 struct ext4_sb_info *sbi = EXT4_SB(sb); 1107 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -956,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb,
956 int qtype, qfmt; 1113 int qtype, qfmt;
957 char *qname; 1114 char *qname;
958#endif 1115#endif
959 ext4_fsblk_t last_block;
960 1116
961 if (!options) 1117 if (!options)
962 return 1; 1118 return 1;
@@ -1068,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1068 } 1224 }
1069 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1225 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
1070 break; 1226 break;
1071 case Opt_journal_inum:
1072 if (is_remount) {
1073 printk(KERN_ERR "EXT4-fs: cannot specify "
1074 "journal on remount\n");
1075 return 0;
1076 }
1077 if (match_int(&args[0], &option))
1078 return 0;
1079 *inum = option;
1080 break;
1081 case Opt_journal_dev: 1227 case Opt_journal_dev:
1082 if (is_remount) { 1228 if (is_remount) {
1083 printk(KERN_ERR "EXT4-fs: cannot specify " 1229 printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -1107,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb,
1107 option = JBD2_DEFAULT_MAX_COMMIT_AGE; 1253 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1108 sbi->s_commit_interval = HZ * option; 1254 sbi->s_commit_interval = HZ * option;
1109 break; 1255 break;
1256 case Opt_max_batch_time:
1257 if (match_int(&args[0], &option))
1258 return 0;
1259 if (option < 0)
1260 return 0;
1261 if (option == 0)
1262 option = EXT4_DEF_MAX_BATCH_TIME;
1263 sbi->s_max_batch_time = option;
1264 break;
1265 case Opt_min_batch_time:
1266 if (match_int(&args[0], &option))
1267 return 0;
1268 if (option < 0)
1269 return 0;
1270 sbi->s_min_batch_time = option;
1271 break;
1110 case Opt_data_journal: 1272 case Opt_data_journal:
1111 data_opt = EXT4_MOUNT_JOURNAL_DATA; 1273 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1112 goto datacheck; 1274 goto datacheck;
@@ -1142,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb,
1142 case Opt_grpjquota: 1304 case Opt_grpjquota:
1143 qtype = GRPQUOTA; 1305 qtype = GRPQUOTA;
1144set_qf_name: 1306set_qf_name:
1145 if ((sb_any_quota_enabled(sb) || 1307 if (sb_any_quota_loaded(sb) &&
1146 sb_any_quota_suspended(sb)) &&
1147 !sbi->s_qf_names[qtype]) { 1308 !sbi->s_qf_names[qtype]) {
1148 printk(KERN_ERR 1309 printk(KERN_ERR
1149 "EXT4-fs: Cannot change journaled " 1310 "EXT4-fs: Cannot change journaled "
@@ -1182,8 +1343,7 @@ set_qf_name:
1182 case Opt_offgrpjquota: 1343 case Opt_offgrpjquota:
1183 qtype = GRPQUOTA; 1344 qtype = GRPQUOTA;
1184clear_qf_name: 1345clear_qf_name:
1185 if ((sb_any_quota_enabled(sb) || 1346 if (sb_any_quota_loaded(sb) &&
1186 sb_any_quota_suspended(sb)) &&
1187 sbi->s_qf_names[qtype]) { 1347 sbi->s_qf_names[qtype]) {
1188 printk(KERN_ERR "EXT4-fs: Cannot change " 1348 printk(KERN_ERR "EXT4-fs: Cannot change "
1189 "journaled quota options when " 1349 "journaled quota options when "
@@ -1202,8 +1362,7 @@ clear_qf_name:
1202 case Opt_jqfmt_vfsv0: 1362 case Opt_jqfmt_vfsv0:
1203 qfmt = QFMT_VFS_V0; 1363 qfmt = QFMT_VFS_V0;
1204set_qf_format: 1364set_qf_format:
1205 if ((sb_any_quota_enabled(sb) || 1365 if (sb_any_quota_loaded(sb) &&
1206 sb_any_quota_suspended(sb)) &&
1207 sbi->s_jquota_fmt != qfmt) { 1366 sbi->s_jquota_fmt != qfmt) {
1208 printk(KERN_ERR "EXT4-fs: Cannot change " 1367 printk(KERN_ERR "EXT4-fs: Cannot change "
1209 "journaled quota options when " 1368 "journaled quota options when "
@@ -1222,7 +1381,7 @@ set_qf_format:
1222 set_opt(sbi->s_mount_opt, GRPQUOTA); 1381 set_opt(sbi->s_mount_opt, GRPQUOTA);
1223 break; 1382 break;
1224 case Opt_noquota: 1383 case Opt_noquota:
1225 if (sb_any_quota_enabled(sb)) { 1384 if (sb_any_quota_loaded(sb)) {
1226 printk(KERN_ERR "EXT4-fs: Cannot change quota " 1385 printk(KERN_ERR "EXT4-fs: Cannot change quota "
1227 "options when quota turned on.\n"); 1386 "options when quota turned on.\n");
1228 return 0; 1387 return 0;
@@ -1280,33 +1439,6 @@ set_qf_format:
1280 case Opt_bh: 1439 case Opt_bh:
1281 clear_opt(sbi->s_mount_opt, NOBH); 1440 clear_opt(sbi->s_mount_opt, NOBH);
1282 break; 1441 break;
1283 case Opt_extents:
1284 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
1285 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1286 ext4_warning(sb, __func__,
1287 "extents feature not enabled "
1288 "on this filesystem, use tune2fs\n");
1289 return 0;
1290 }
1291 set_opt(sbi->s_mount_opt, EXTENTS);
1292 break;
1293 case Opt_noextents:
1294 /*
1295 * When e2fsprogs support resizing an already existing
1296 * ext3 file system to greater than 2**32 we need to
1297 * add support to block allocator to handle growing
1298 * already existing block mapped inode so that blocks
1299 * allocated for them fall within 2**32
1300 */
1301 last_block = ext4_blocks_count(sbi->s_es) - 1;
1302 if (last_block > 0xffffffffULL) {
1303 printk(KERN_ERR "EXT4-fs: Filesystem too "
1304 "large to mount with "
1305 "-o noextents options\n");
1306 return 0;
1307 }
1308 clear_opt(sbi->s_mount_opt, EXTENTS);
1309 break;
1310 case Opt_i_version: 1442 case Opt_i_version:
1311 set_opt(sbi->s_mount_opt, I_VERSION); 1443 set_opt(sbi->s_mount_opt, I_VERSION);
1312 sb->s_flags |= MS_I_VERSION; 1444 sb->s_flags |= MS_I_VERSION;
@@ -1331,6 +1463,14 @@ set_qf_format:
1331 return 0; 1463 return 0;
1332 sbi->s_inode_readahead_blks = option; 1464 sbi->s_inode_readahead_blks = option;
1333 break; 1465 break;
1466 case Opt_journal_ioprio:
1467 if (match_int(&args[0], &option))
1468 return 0;
1469 if (option < 0 || option > 7)
1470 break;
1471 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1472 option);
1473 break;
1334 default: 1474 default:
1335 printk(KERN_ERR 1475 printk(KERN_ERR
1336 "EXT4-fs: Unrecognized mount option \"%s\" " 1476 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1406,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1406 printk(KERN_WARNING 1546 printk(KERN_WARNING
1407 "EXT4-fs warning: checktime reached, " 1547 "EXT4-fs warning: checktime reached, "
1408 "running e2fsck is recommended\n"); 1548 "running e2fsck is recommended\n");
1409#if 0 1549 if (!sbi->s_journal)
1410 /* @@@ We _will_ want to clear the valid bit if we find 1550 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1411 * inconsistencies, to force a fsck at reboot. But for
1412 * a plain journaled filesystem we can keep it set as
1413 * valid forever! :)
1414 */
1415 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1416#endif
1417 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1551 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1418 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 1552 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1419 le16_add_cpu(&es->s_mnt_count, 1); 1553 le16_add_cpu(&es->s_mnt_count, 1);
1420 es->s_mtime = cpu_to_le32(get_seconds()); 1554 es->s_mtime = cpu_to_le32(get_seconds());
1421 ext4_update_dynamic_rev(sb); 1555 ext4_update_dynamic_rev(sb);
1422 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 1556 if (sbi->s_journal)
1557 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1423 1558
1424 ext4_commit_super(sb, es, 1); 1559 ext4_commit_super(sb, es, 1);
1425 if (test_opt(sb, DEBUG)) 1560 if (test_opt(sb, DEBUG))
1426 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " 1561 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1427 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 1562 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1428 sb->s_blocksize, 1563 sb->s_blocksize,
1429 sbi->s_groups_count, 1564 sbi->s_groups_count,
@@ -1431,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1431 EXT4_INODES_PER_GROUP(sb), 1566 EXT4_INODES_PER_GROUP(sb),
1432 sbi->s_mount_opt); 1567 sbi->s_mount_opt);
1433 1568
1434 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", 1569 if (EXT4_SB(sb)->s_journal) {
1435 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : 1570 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
1436 "external", EXT4_SB(sb)->s_journal->j_devname); 1571 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1572 "external", EXT4_SB(sb)->s_journal->j_devname);
1573 } else {
1574 printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
1575 }
1437 return res; 1576 return res;
1438} 1577}
1439 1578
@@ -1445,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1445 ext4_group_t flex_group_count; 1584 ext4_group_t flex_group_count;
1446 ext4_group_t flex_group; 1585 ext4_group_t flex_group;
1447 int groups_per_flex = 0; 1586 int groups_per_flex = 0;
1448 __u64 block_bitmap = 0;
1449 int i; 1587 int i;
1450 1588
1451 if (!sbi->s_es->s_log_groups_per_flex) { 1589 if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1464,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
1464 sizeof(struct flex_groups), GFP_KERNEL); 1602 sizeof(struct flex_groups), GFP_KERNEL);
1465 if (sbi->s_flex_groups == NULL) { 1603 if (sbi->s_flex_groups == NULL) {
1466 printk(KERN_ERR "EXT4-fs: not enough memory for " 1604 printk(KERN_ERR "EXT4-fs: not enough memory for "
1467 "%lu flex groups\n", flex_group_count); 1605 "%u flex groups\n", flex_group_count);
1468 goto failed; 1606 goto failed;
1469 } 1607 }
1470 1608
1471 gdp = ext4_get_group_desc(sb, 1, &bh);
1472 block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1473
1474 for (i = 0; i < sbi->s_groups_count; i++) { 1609 for (i = 0; i < sbi->s_groups_count; i++) {
1475 gdp = ext4_get_group_desc(sb, i, &bh); 1610 gdp = ext4_get_group_desc(sb, i, &bh);
1476 1611
1477 flex_group = ext4_flex_group(sbi, i); 1612 flex_group = ext4_flex_group(sbi, i);
1478 sbi->s_flex_groups[flex_group].free_inodes += 1613 sbi->s_flex_groups[flex_group].free_inodes +=
1479 le16_to_cpu(gdp->bg_free_inodes_count); 1614 ext4_free_inodes_count(sb, gdp);
1480 sbi->s_flex_groups[flex_group].free_blocks += 1615 sbi->s_flex_groups[flex_group].free_blocks +=
1481 le16_to_cpu(gdp->bg_free_blocks_count); 1616 ext4_free_blks_count(sb, gdp);
1482 } 1617 }
1483 1618
1484 return 1; 1619 return 1;
@@ -1552,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1552 block_bitmap = ext4_block_bitmap(sb, gdp); 1687 block_bitmap = ext4_block_bitmap(sb, gdp);
1553 if (block_bitmap < first_block || block_bitmap > last_block) { 1688 if (block_bitmap < first_block || block_bitmap > last_block) {
1554 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1689 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1555 "Block bitmap for group %lu not in group " 1690 "Block bitmap for group %u not in group "
1556 "(block %llu)!\n", i, block_bitmap); 1691 "(block %llu)!\n", i, block_bitmap);
1557 return 0; 1692 return 0;
1558 } 1693 }
1559 inode_bitmap = ext4_inode_bitmap(sb, gdp); 1694 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1560 if (inode_bitmap < first_block || inode_bitmap > last_block) { 1695 if (inode_bitmap < first_block || inode_bitmap > last_block) {
1561 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1696 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1562 "Inode bitmap for group %lu not in group " 1697 "Inode bitmap for group %u not in group "
1563 "(block %llu)!\n", i, inode_bitmap); 1698 "(block %llu)!\n", i, inode_bitmap);
1564 return 0; 1699 return 0;
1565 } 1700 }
@@ -1567,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1567 if (inode_table < first_block || 1702 if (inode_table < first_block ||
1568 inode_table + sbi->s_itb_per_group - 1 > last_block) { 1703 inode_table + sbi->s_itb_per_group - 1 > last_block) {
1569 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1704 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1570 "Inode table for group %lu not in group " 1705 "Inode table for group %u not in group "
1571 "(block %llu)!\n", i, inode_table); 1706 "(block %llu)!\n", i, inode_table);
1572 return 0; 1707 return 0;
1573 } 1708 }
1574 spin_lock(sb_bgl_lock(sbi, i)); 1709 spin_lock(sb_bgl_lock(sbi, i));
1575 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { 1710 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1576 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1711 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1577 "Checksum for group %lu failed (%u!=%u)\n", 1712 "Checksum for group %u failed (%u!=%u)\n",
1578 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 1713 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1579 gdp)), le16_to_cpu(gdp->bg_checksum)); 1714 gdp)), le16_to_cpu(gdp->bg_checksum));
1580 if (!(sb->s_flags & MS_RDONLY)) { 1715 if (!(sb->s_flags & MS_RDONLY)) {
@@ -1866,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1866 ext4_fsblk_t sb_block = get_sb_block(&data); 2001 ext4_fsblk_t sb_block = get_sb_block(&data);
1867 ext4_fsblk_t logical_sb_block; 2002 ext4_fsblk_t logical_sb_block;
1868 unsigned long offset = 0; 2003 unsigned long offset = 0;
1869 unsigned int journal_inum = 0;
1870 unsigned long journal_devnum = 0; 2004 unsigned long journal_devnum = 0;
1871 unsigned long def_mount_opts; 2005 unsigned long def_mount_opts;
1872 struct inode *root; 2006 struct inode *root;
1873 char *cp; 2007 char *cp;
2008 const char *descr;
1874 int ret = -EINVAL; 2009 int ret = -EINVAL;
1875 int blocksize; 2010 int blocksize;
1876 int db_count; 2011 unsigned int db_count;
1877 int i; 2012 unsigned int i;
1878 int needs_recovery, has_huge_files; 2013 int needs_recovery, has_huge_files;
1879 __le32 features; 2014 int features;
1880 __u64 blocks_count; 2015 __u64 blocks_count;
1881 int err; 2016 int err;
2017 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
1882 2018
1883 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2019 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1884 if (!sbi) 2020 if (!sbi)
@@ -1959,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1959 2095
1960 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2096 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1961 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 2097 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
2098 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2099 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2100 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
1962 2101
1963 set_opt(sbi->s_mount_opt, RESERVATION); 2102 set_opt(sbi->s_mount_opt, RESERVATION);
1964 set_opt(sbi->s_mount_opt, BARRIER); 2103 set_opt(sbi->s_mount_opt, BARRIER);
1965 2104
1966 /* 2105 /*
1967 * turn on extents feature by default in ext4 filesystem
1968 * only if feature flag already set by mkfs or tune2fs.
1969 * Use -o noextents to turn it off
1970 */
1971 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
1972 set_opt(sbi->s_mount_opt, EXTENTS);
1973 else
1974 ext4_warning(sb, __func__,
1975 "extents feature not enabled on this filesystem, "
1976 "use tune2fs.\n");
1977
1978 /*
1979 * enable delayed allocation by default 2106 * enable delayed allocation by default
1980 * Use -o nodelalloc to turn it off 2107 * Use -o nodelalloc to turn it off
1981 */ 2108 */
1982 set_opt(sbi->s_mount_opt, DELALLOC); 2109 set_opt(sbi->s_mount_opt, DELALLOC);
1983 2110
1984 2111
1985 if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, 2112 if (!parse_options((char *) data, sb, &journal_devnum,
1986 NULL, 0)) 2113 &journal_ioprio, NULL, 0))
1987 goto failed_mount; 2114 goto failed_mount;
1988 2115
1989 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2116 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2005,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2005 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2132 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
2006 if (features) { 2133 if (features) {
2007 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " 2134 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
2008 "unsupported optional features (%x).\n", 2135 "unsupported optional features (%x).\n", sb->s_id,
2009 sb->s_id, le32_to_cpu(features)); 2136 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2137 ~EXT4_FEATURE_INCOMPAT_SUPP));
2010 goto failed_mount; 2138 goto failed_mount;
2011 } 2139 }
2012 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); 2140 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2013 if (!(sb->s_flags & MS_RDONLY) && features) { 2141 if (!(sb->s_flags & MS_RDONLY) && features) {
2014 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " 2142 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
2015 "unsupported optional features (%x).\n", 2143 "unsupported optional features (%x).\n", sb->s_id,
2016 sb->s_id, le32_to_cpu(features)); 2144 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2145 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2017 goto failed_mount; 2146 goto failed_mount;
2018 } 2147 }
2019 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, 2148 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2118,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2118 for (i = 0; i < 4; i++) 2247 for (i = 0; i < 4; i++)
2119 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 2248 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2120 sbi->s_def_hash_version = es->s_def_hash_version; 2249 sbi->s_def_hash_version = es->s_def_hash_version;
2250 i = le32_to_cpu(es->s_flags);
2251 if (i & EXT2_FLAGS_UNSIGNED_HASH)
2252 sbi->s_hash_unsigned = 3;
2253 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2254#ifdef __CHAR_UNSIGNED__
2255 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2256 sbi->s_hash_unsigned = 3;
2257#else
2258 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2259#endif
2260 sb->s_dirt = 1;
2261 }
2121 2262
2122 if (sbi->s_blocks_per_group > blocksize * 8) { 2263 if (sbi->s_blocks_per_group > blocksize * 8) {
2123 printk(KERN_ERR 2264 printk(KERN_ERR
@@ -2145,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2145 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 2286 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2146 goto cantfind_ext4; 2287 goto cantfind_ext4;
2147 2288
2148 /* ensure blocks_count calculation below doesn't sign-extend */ 2289 /*
2149 if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < 2290 * It makes no sense for the first data block to be beyond the end
2150 le32_to_cpu(es->s_first_data_block) + 1) { 2291 * of the filesystem.
2151 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " 2292 */
2152 "first data block %u, blocks per group %lu\n", 2293 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2153 ext4_blocks_count(es), 2294 printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
2154 le32_to_cpu(es->s_first_data_block), 2295 "block %u is beyond end of filesystem (%llu)\n",
2155 EXT4_BLOCKS_PER_GROUP(sb)); 2296 le32_to_cpu(es->s_first_data_block),
2297 ext4_blocks_count(es));
2156 goto failed_mount; 2298 goto failed_mount;
2157 } 2299 }
2158 blocks_count = (ext4_blocks_count(es) - 2300 blocks_count = (ext4_blocks_count(es) -
2159 le32_to_cpu(es->s_first_data_block) + 2301 le32_to_cpu(es->s_first_data_block) +
2160 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2302 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2161 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 2303 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2304 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2305 printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2306 "(block count %llu, first data block %u, "
2307 "blocks per group %lu)\n", sbi->s_groups_count,
2308 ext4_blocks_count(es),
2309 le32_to_cpu(es->s_first_data_block),
2310 EXT4_BLOCKS_PER_GROUP(sb));
2311 goto failed_mount;
2312 }
2162 sbi->s_groups_count = blocks_count; 2313 sbi->s_groups_count = blocks_count;
2163 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2314 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2164 EXT4_DESC_PER_BLOCK(sb); 2315 EXT4_DESC_PER_BLOCK(sb);
@@ -2270,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2270 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2421 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2271 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2422 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2272 ext4_commit_super(sb, es, 1); 2423 ext4_commit_super(sb, es, 1);
2273 printk(KERN_CRIT
2274 "EXT4-fs (device %s): mount failed\n",
2275 sb->s_id);
2276 goto failed_mount4; 2424 goto failed_mount4;
2277 } 2425 }
2278 } 2426 }
2279 } else if (journal_inum) { 2427 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2280 if (ext4_create_journal(sb, es, journal_inum)) 2428 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2281 goto failed_mount3; 2429 printk(KERN_ERR "EXT4-fs: required journal recovery "
2430 "suppressed and not mounted read-only\n");
2431 goto failed_mount4;
2282 } else { 2432 } else {
2283 if (!silent) 2433 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2284 printk(KERN_ERR 2434 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2285 "ext4: No journal on filesystem on %s\n", 2435 sbi->s_journal = NULL;
2286 sb->s_id); 2436 needs_recovery = 0;
2287 goto failed_mount3; 2437 goto no_journal;
2288 } 2438 }
2289 2439
2290 if (ext4_blocks_count(es) > 0xffffffffULL && 2440 if (ext4_blocks_count(es) > 0xffffffffULL &&
2291 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2441 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2292 JBD2_FEATURE_INCOMPAT_64BIT)) { 2442 JBD2_FEATURE_INCOMPAT_64BIT)) {
2293 printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); 2443 printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
2294 goto failed_mount4; 2444 goto failed_mount4;
2295 } 2445 }
2296 2446
@@ -2335,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2335 default: 2485 default:
2336 break; 2486 break;
2337 } 2487 }
2488 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2489
2490no_journal:
2338 2491
2339 if (test_opt(sb, NOBH)) { 2492 if (test_opt(sb, NOBH)) {
2340 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2493 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
@@ -2420,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2420 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 2573 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
2421 ext4_orphan_cleanup(sb, es); 2574 ext4_orphan_cleanup(sb, es);
2422 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 2575 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
2423 if (needs_recovery) 2576 if (needs_recovery) {
2424 printk(KERN_INFO "EXT4-fs: recovery complete.\n"); 2577 printk(KERN_INFO "EXT4-fs: recovery complete.\n");
2425 ext4_mark_recovery_complete(sb, es); 2578 ext4_mark_recovery_complete(sb, es);
2426 printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", 2579 }
2427 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": 2580 if (EXT4_SB(sb)->s_journal) {
2428 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2581 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
2429 "writeback"); 2582 descr = " journalled data mode";
2583 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
2584 descr = " ordered data mode";
2585 else
2586 descr = " writeback data mode";
2587 } else
2588 descr = "out journal";
2589
2590 printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
2591 sb->s_id, descr);
2430 2592
2431 lock_kernel(); 2593 lock_kernel();
2432 return 0; 2594 return 0;
@@ -2438,8 +2600,11 @@ cantfind_ext4:
2438 goto failed_mount; 2600 goto failed_mount;
2439 2601
2440failed_mount4: 2602failed_mount4:
2441 jbd2_journal_destroy(sbi->s_journal); 2603 printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
2442 sbi->s_journal = NULL; 2604 if (sbi->s_journal) {
2605 jbd2_journal_destroy(sbi->s_journal);
2606 sbi->s_journal = NULL;
2607 }
2443failed_mount3: 2608failed_mount3:
2444 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2609 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2445 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2610 percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2476,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2476{ 2641{
2477 struct ext4_sb_info *sbi = EXT4_SB(sb); 2642 struct ext4_sb_info *sbi = EXT4_SB(sb);
2478 2643
2479 if (sbi->s_commit_interval) 2644 journal->j_commit_interval = sbi->s_commit_interval;
2480 journal->j_commit_interval = sbi->s_commit_interval; 2645 journal->j_min_batch_time = sbi->s_min_batch_time;
2481 /* We could also set up an ext4-specific default for the commit 2646 journal->j_max_batch_time = sbi->s_max_batch_time;
2482 * interval here, but for now we'll just fall back to the jbd
2483 * default. */
2484 2647
2485 spin_lock(&journal->j_state_lock); 2648 spin_lock(&journal->j_state_lock);
2486 if (test_opt(sb, BARRIER)) 2649 if (test_opt(sb, BARRIER))
@@ -2500,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2500 struct inode *journal_inode; 2663 struct inode *journal_inode;
2501 journal_t *journal; 2664 journal_t *journal;
2502 2665
2666 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2667
2503 /* First, test for the existence of a valid inode on disk. Bad 2668 /* First, test for the existence of a valid inode on disk. Bad
2504 * things happen if we iget() an unused inode, as the subsequent 2669 * things happen if we iget() an unused inode, as the subsequent
2505 * iput() will try to delete it. */ 2670 * iput() will try to delete it. */
@@ -2548,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2548 struct ext4_super_block *es; 2713 struct ext4_super_block *es;
2549 struct block_device *bdev; 2714 struct block_device *bdev;
2550 2715
2716 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2717
2551 bdev = ext4_blkdev_get(j_dev); 2718 bdev = ext4_blkdev_get(j_dev);
2552 if (bdev == NULL) 2719 if (bdev == NULL)
2553 return NULL; 2720 return NULL;
2554 2721
2555 if (bd_claim(bdev, sb)) { 2722 if (bd_claim(bdev, sb)) {
2556 printk(KERN_ERR 2723 printk(KERN_ERR
2557 "EXT4: failed to claim external journal device.\n"); 2724 "EXT4-fs: failed to claim external journal device.\n");
2558 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 2725 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2559 return NULL; 2726 return NULL;
2560 } 2727 }
@@ -2635,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb,
2635 int err = 0; 2802 int err = 0;
2636 int really_read_only; 2803 int really_read_only;
2637 2804
2805 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2806
2638 if (journal_devnum && 2807 if (journal_devnum &&
2639 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2808 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2640 printk(KERN_INFO "EXT4-fs: external journal device major/minor " 2809 printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2719,55 +2888,14 @@ static int ext4_load_journal(struct super_block *sb,
2719 return 0; 2888 return 0;
2720} 2889}
2721 2890
2722static int ext4_create_journal(struct super_block *sb, 2891static int ext4_commit_super(struct super_block *sb,
2723 struct ext4_super_block *es,
2724 unsigned int journal_inum)
2725{
2726 journal_t *journal;
2727 int err;
2728
2729 if (sb->s_flags & MS_RDONLY) {
2730 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2731 "create journal.\n");
2732 return -EROFS;
2733 }
2734
2735 journal = ext4_get_journal(sb, journal_inum);
2736 if (!journal)
2737 return -EINVAL;
2738
2739 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2740 journal_inum);
2741
2742 err = jbd2_journal_create(journal);
2743 if (err) {
2744 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2745 jbd2_journal_destroy(journal);
2746 return -EIO;
2747 }
2748
2749 EXT4_SB(sb)->s_journal = journal;
2750
2751 ext4_update_dynamic_rev(sb);
2752 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2753 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2754
2755 es->s_journal_inum = cpu_to_le32(journal_inum);
2756 sb->s_dirt = 1;
2757
2758 /* Make sure we flush the recovery flag to disk. */
2759 ext4_commit_super(sb, es, 1);
2760
2761 return 0;
2762}
2763
2764static void ext4_commit_super(struct super_block *sb,
2765 struct ext4_super_block *es, int sync) 2892 struct ext4_super_block *es, int sync)
2766{ 2893{
2767 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 2894 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
2895 int error = 0;
2768 2896
2769 if (!sbh) 2897 if (!sbh)
2770 return; 2898 return error;
2771 if (buffer_write_io_error(sbh)) { 2899 if (buffer_write_io_error(sbh)) {
2772 /* 2900 /*
2773 * Oh, dear. A previous attempt to write the 2901 * Oh, dear. A previous attempt to write the
@@ -2777,25 +2905,33 @@ static void ext4_commit_super(struct super_block *sb,
2777 * be remapped. Nothing we can do but to retry the 2905 * be remapped. Nothing we can do but to retry the
2778 * write and hope for the best. 2906 * write and hope for the best.
2779 */ 2907 */
2780 printk(KERN_ERR "ext4: previous I/O error to " 2908 printk(KERN_ERR "EXT4-fs: previous I/O error to "
2781 "superblock detected for %s.\n", sb->s_id); 2909 "superblock detected for %s.\n", sb->s_id);
2782 clear_buffer_write_io_error(sbh); 2910 clear_buffer_write_io_error(sbh);
2783 set_buffer_uptodate(sbh); 2911 set_buffer_uptodate(sbh);
2784 } 2912 }
2785 es->s_wtime = cpu_to_le32(get_seconds()); 2913 es->s_wtime = cpu_to_le32(get_seconds());
2786 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2914 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2787 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2915 &EXT4_SB(sb)->s_freeblocks_counter));
2916 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
2917 &EXT4_SB(sb)->s_freeinodes_counter));
2918
2788 BUFFER_TRACE(sbh, "marking dirty"); 2919 BUFFER_TRACE(sbh, "marking dirty");
2789 mark_buffer_dirty(sbh); 2920 mark_buffer_dirty(sbh);
2790 if (sync) { 2921 if (sync) {
2791 sync_dirty_buffer(sbh); 2922 error = sync_dirty_buffer(sbh);
2792 if (buffer_write_io_error(sbh)) { 2923 if (error)
2793 printk(KERN_ERR "ext4: I/O error while writing " 2924 return error;
2925
2926 error = buffer_write_io_error(sbh);
2927 if (error) {
2928 printk(KERN_ERR "EXT4-fs: I/O error while writing "
2794 "superblock for %s.\n", sb->s_id); 2929 "superblock for %s.\n", sb->s_id);
2795 clear_buffer_write_io_error(sbh); 2930 clear_buffer_write_io_error(sbh);
2796 set_buffer_uptodate(sbh); 2931 set_buffer_uptodate(sbh);
2797 } 2932 }
2798 } 2933 }
2934 return error;
2799} 2935}
2800 2936
2801 2937
@@ -2809,6 +2945,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2809{ 2945{
2810 journal_t *journal = EXT4_SB(sb)->s_journal; 2946 journal_t *journal = EXT4_SB(sb)->s_journal;
2811 2947
2948 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
2949 BUG_ON(journal != NULL);
2950 return;
2951 }
2812 jbd2_journal_lock_updates(journal); 2952 jbd2_journal_lock_updates(journal);
2813 if (jbd2_journal_flush(journal) < 0) 2953 if (jbd2_journal_flush(journal) < 0)
2814 goto out; 2954 goto out;
@@ -2838,6 +2978,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
2838 int j_errno; 2978 int j_errno;
2839 const char *errstr; 2979 const char *errstr;
2840 2980
2981 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2982
2841 journal = EXT4_SB(sb)->s_journal; 2983 journal = EXT4_SB(sb)->s_journal;
2842 2984
2843 /* 2985 /*
@@ -2870,14 +3012,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
2870int ext4_force_commit(struct super_block *sb) 3012int ext4_force_commit(struct super_block *sb)
2871{ 3013{
2872 journal_t *journal; 3014 journal_t *journal;
2873 int ret; 3015 int ret = 0;
2874 3016
2875 if (sb->s_flags & MS_RDONLY) 3017 if (sb->s_flags & MS_RDONLY)
2876 return 0; 3018 return 0;
2877 3019
2878 journal = EXT4_SB(sb)->s_journal; 3020 journal = EXT4_SB(sb)->s_journal;
2879 sb->s_dirt = 0; 3021 if (journal) {
2880 ret = ext4_journal_force_commit(journal); 3022 sb->s_dirt = 0;
3023 ret = ext4_journal_force_commit(journal);
3024 }
3025
2881 return ret; 3026 return ret;
2882} 3027}
2883 3028
@@ -2889,9 +3034,13 @@ int ext4_force_commit(struct super_block *sb)
2889 */ 3034 */
2890static void ext4_write_super(struct super_block *sb) 3035static void ext4_write_super(struct super_block *sb)
2891{ 3036{
2892 if (mutex_trylock(&sb->s_lock) != 0) 3037 if (EXT4_SB(sb)->s_journal) {
2893 BUG(); 3038 if (mutex_trylock(&sb->s_lock) != 0)
2894 sb->s_dirt = 0; 3039 BUG();
3040 sb->s_dirt = 0;
3041 } else {
3042 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3043 }
2895} 3044}
2896 3045
2897static int ext4_sync_fs(struct super_block *sb, int wait) 3046static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2900,10 +3049,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2900 3049
2901 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3050 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2902 sb->s_dirt = 0; 3051 sb->s_dirt = 0;
2903 if (wait) 3052 if (EXT4_SB(sb)->s_journal) {
2904 ret = ext4_force_commit(sb); 3053 if (wait)
2905 else 3054 ret = ext4_force_commit(sb);
2906 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); 3055 else
3056 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
3057 } else {
3058 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3059 }
2907 return ret; 3060 return ret;
2908} 3061}
2909 3062
@@ -2911,36 +3064,48 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2911 * LVM calls this function before a (read-only) snapshot is created. This 3064 * LVM calls this function before a (read-only) snapshot is created. This
2912 * gives us a chance to flush the journal completely and mark the fs clean. 3065 * gives us a chance to flush the journal completely and mark the fs clean.
2913 */ 3066 */
2914static void ext4_write_super_lockfs(struct super_block *sb) 3067static int ext4_freeze(struct super_block *sb)
2915{ 3068{
3069 int error = 0;
3070 journal_t *journal;
2916 sb->s_dirt = 0; 3071 sb->s_dirt = 0;
2917 3072
2918 if (!(sb->s_flags & MS_RDONLY)) { 3073 if (!(sb->s_flags & MS_RDONLY)) {
2919 journal_t *journal = EXT4_SB(sb)->s_journal; 3074 journal = EXT4_SB(sb)->s_journal;
2920 3075
2921 /* Now we set up the journal barrier. */ 3076 if (journal) {
2922 jbd2_journal_lock_updates(journal); 3077 /* Now we set up the journal barrier. */
3078 jbd2_journal_lock_updates(journal);
2923 3079
2924 /* 3080 /*
2925 * We don't want to clear needs_recovery flag when we failed 3081 * We don't want to clear needs_recovery flag when we
2926 * to flush the journal. 3082 * failed to flush the journal.
2927 */ 3083 */
2928 if (jbd2_journal_flush(journal) < 0) 3084 error = jbd2_journal_flush(journal);
2929 return; 3085 if (error < 0)
3086 goto out;
3087 }
2930 3088
2931 /* Journal blocked and flushed, clear needs_recovery flag. */ 3089 /* Journal blocked and flushed, clear needs_recovery flag. */
2932 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3090 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2933 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3091 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3092 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3093 if (error)
3094 goto out;
2934 } 3095 }
3096 return 0;
3097out:
3098 jbd2_journal_unlock_updates(journal);
3099 return error;
2935} 3100}
2936 3101
2937/* 3102/*
2938 * Called by LVM after the snapshot is done. We need to reset the RECOVER 3103 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2939 * flag here, even though the filesystem is not technically dirty yet. 3104 * flag here, even though the filesystem is not technically dirty yet.
2940 */ 3105 */
2941static void ext4_unlockfs(struct super_block *sb) 3106static int ext4_unfreeze(struct super_block *sb)
2942{ 3107{
2943 if (!(sb->s_flags & MS_RDONLY)) { 3108 if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
2944 lock_super(sb); 3109 lock_super(sb);
2945 /* Reser the needs_recovery flag before the fs is unlocked. */ 3110 /* Reser the needs_recovery flag before the fs is unlocked. */
2946 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3111 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2948,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb)
2948 unlock_super(sb); 3113 unlock_super(sb);
2949 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3114 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
2950 } 3115 }
3116 return 0;
2951} 3117}
2952 3118
2953static int ext4_remount(struct super_block *sb, int *flags, char *data) 3119static int ext4_remount(struct super_block *sb, int *flags, char *data)
@@ -2958,6 +3124,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2958 unsigned long old_sb_flags; 3124 unsigned long old_sb_flags;
2959 struct ext4_mount_options old_opts; 3125 struct ext4_mount_options old_opts;
2960 ext4_group_t g; 3126 ext4_group_t g;
3127 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
2961 int err; 3128 int err;
2962#ifdef CONFIG_QUOTA 3129#ifdef CONFIG_QUOTA
2963 int i; 3130 int i;
@@ -2969,16 +3136,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2969 old_opts.s_resuid = sbi->s_resuid; 3136 old_opts.s_resuid = sbi->s_resuid;
2970 old_opts.s_resgid = sbi->s_resgid; 3137 old_opts.s_resgid = sbi->s_resgid;
2971 old_opts.s_commit_interval = sbi->s_commit_interval; 3138 old_opts.s_commit_interval = sbi->s_commit_interval;
3139 old_opts.s_min_batch_time = sbi->s_min_batch_time;
3140 old_opts.s_max_batch_time = sbi->s_max_batch_time;
2972#ifdef CONFIG_QUOTA 3141#ifdef CONFIG_QUOTA
2973 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 3142 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2974 for (i = 0; i < MAXQUOTAS; i++) 3143 for (i = 0; i < MAXQUOTAS; i++)
2975 old_opts.s_qf_names[i] = sbi->s_qf_names[i]; 3144 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2976#endif 3145#endif
3146 if (sbi->s_journal && sbi->s_journal->j_task->io_context)
3147 journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
2977 3148
2978 /* 3149 /*
2979 * Allow the "check" option to be passed as a remount option. 3150 * Allow the "check" option to be passed as a remount option.
2980 */ 3151 */
2981 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { 3152 if (!parse_options(data, sb, NULL, &journal_ioprio,
3153 &n_blocks_count, 1)) {
2982 err = -EINVAL; 3154 err = -EINVAL;
2983 goto restore_opts; 3155 goto restore_opts;
2984 } 3156 }
@@ -2991,7 +3163,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2991 3163
2992 es = sbi->s_es; 3164 es = sbi->s_es;
2993 3165
2994 ext4_init_journal_params(sb, sbi->s_journal); 3166 if (sbi->s_journal) {
3167 ext4_init_journal_params(sb, sbi->s_journal);
3168 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3169 }
2995 3170
2996 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 3171 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2997 n_blocks_count > ext4_blocks_count(es)) { 3172 n_blocks_count > ext4_blocks_count(es)) {
@@ -3020,17 +3195,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3020 * We have to unlock super so that we can wait for 3195 * We have to unlock super so that we can wait for
3021 * transactions. 3196 * transactions.
3022 */ 3197 */
3023 unlock_super(sb); 3198 if (sbi->s_journal) {
3024 ext4_mark_recovery_complete(sb, es); 3199 unlock_super(sb);
3025 lock_super(sb); 3200 ext4_mark_recovery_complete(sb, es);
3201 lock_super(sb);
3202 }
3026 } else { 3203 } else {
3027 __le32 ret; 3204 int ret;
3028 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3205 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3029 ~EXT4_FEATURE_RO_COMPAT_SUPP))) { 3206 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3030 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3207 printk(KERN_WARNING "EXT4-fs: %s: couldn't "
3031 "remount RDWR because of unsupported " 3208 "remount RDWR because of unsupported "
3032 "optional features (%x).\n", 3209 "optional features (%x).\n", sb->s_id,
3033 sb->s_id, le32_to_cpu(ret)); 3210 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3211 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3034 err = -EROFS; 3212 err = -EROFS;
3035 goto restore_opts; 3213 goto restore_opts;
3036 } 3214 }
@@ -3047,7 +3225,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3047 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { 3225 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
3048 printk(KERN_ERR 3226 printk(KERN_ERR
3049 "EXT4-fs: ext4_remount: " 3227 "EXT4-fs: ext4_remount: "
3050 "Checksum for group %lu failed (%u!=%u)\n", 3228 "Checksum for group %u failed (%u!=%u)\n",
3051 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), 3229 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
3052 le16_to_cpu(gdp->bg_checksum)); 3230 le16_to_cpu(gdp->bg_checksum));
3053 err = -EINVAL; 3231 err = -EINVAL;
@@ -3076,7 +3254,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3076 * been changed by e2fsck since we originally mounted 3254 * been changed by e2fsck since we originally mounted
3077 * the partition.) 3255 * the partition.)
3078 */ 3256 */
3079 ext4_clear_journal_err(sb, es); 3257 if (sbi->s_journal)
3258 ext4_clear_journal_err(sb, es);
3080 sbi->s_mount_state = le16_to_cpu(es->s_state); 3259 sbi->s_mount_state = le16_to_cpu(es->s_state);
3081 if ((err = ext4_group_extend(sb, es, n_blocks_count))) 3260 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
3082 goto restore_opts; 3261 goto restore_opts;
@@ -3084,6 +3263,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3084 sb->s_flags &= ~MS_RDONLY; 3263 sb->s_flags &= ~MS_RDONLY;
3085 } 3264 }
3086 } 3265 }
3266 if (sbi->s_journal == NULL)
3267 ext4_commit_super(sb, es, 1);
3268
3087#ifdef CONFIG_QUOTA 3269#ifdef CONFIG_QUOTA
3088 /* Release old quota file names */ 3270 /* Release old quota file names */
3089 for (i = 0; i < MAXQUOTAS; i++) 3271 for (i = 0; i < MAXQUOTAS; i++)
@@ -3098,6 +3280,8 @@ restore_opts:
3098 sbi->s_resuid = old_opts.s_resuid; 3280 sbi->s_resuid = old_opts.s_resuid;
3099 sbi->s_resgid = old_opts.s_resgid; 3281 sbi->s_resgid = old_opts.s_resgid;
3100 sbi->s_commit_interval = old_opts.s_commit_interval; 3282 sbi->s_commit_interval = old_opts.s_commit_interval;
3283 sbi->s_min_batch_time = old_opts.s_min_batch_time;
3284 sbi->s_max_batch_time = old_opts.s_max_batch_time;
3101#ifdef CONFIG_QUOTA 3285#ifdef CONFIG_QUOTA
3102 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 3286 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
3103 for (i = 0; i < MAXQUOTAS; i++) { 3287 for (i = 0; i < MAXQUOTAS; i++) {
@@ -3360,7 +3544,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3360 * When we journal data on quota file, we have to flush journal to see 3544 * When we journal data on quota file, we have to flush journal to see
3361 * all updates to the file when we bypass pagecache... 3545 * all updates to the file when we bypass pagecache...
3362 */ 3546 */
3363 if (ext4_should_journal_data(path.dentry->d_inode)) { 3547 if (EXT4_SB(sb)->s_journal &&
3548 ext4_should_journal_data(path.dentry->d_inode)) {
3364 /* 3549 /*
3365 * We don't need to lock updates but journal_flush() could 3550 * We don't need to lock updates but journal_flush() could
3366 * otherwise be livelocked... 3551 * otherwise be livelocked...
@@ -3434,7 +3619,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3434 struct buffer_head *bh; 3619 struct buffer_head *bh;
3435 handle_t *handle = journal_current_handle(); 3620 handle_t *handle = journal_current_handle();
3436 3621
3437 if (!handle) { 3622 if (EXT4_SB(sb)->s_journal && !handle) {
3438 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" 3623 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3439 " cancelled because transaction is not started.\n", 3624 " cancelled because transaction is not started.\n",
3440 (unsigned long long)off, (unsigned long long)len); 3625 (unsigned long long)off, (unsigned long long)len);
@@ -3459,7 +3644,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3459 flush_dcache_page(bh->b_page); 3644 flush_dcache_page(bh->b_page);
3460 unlock_buffer(bh); 3645 unlock_buffer(bh);
3461 if (journal_quota) 3646 if (journal_quota)
3462 err = ext4_journal_dirty_metadata(handle, bh); 3647 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3463 else { 3648 else {
3464 /* Always do at least ordered writes for quotas */ 3649 /* Always do at least ordered writes for quotas */
3465 err = ext4_jbd2_file_inode(handle, inode); 3650 err = ext4_jbd2_file_inode(handle, inode);
@@ -3513,18 +3698,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3513static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, 3698static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3514 size_t cnt, loff_t *ppos) 3699 size_t cnt, loff_t *ppos)
3515{ 3700{
3516 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; 3701 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3517 char str[32]; 3702 char str[32];
3518 unsigned long value;
3519 3703
3520 if (cnt >= sizeof(str)) 3704 if (cnt >= sizeof(str))
3521 return -EINVAL; 3705 return -EINVAL;
3522 if (copy_from_user(str, buf, cnt)) 3706 if (copy_from_user(str, buf, cnt))
3523 return -EFAULT; 3707 return -EFAULT;
3524 value = simple_strtol(str, NULL, 0); 3708
3525 if (value < 0) 3709 *p = simple_strtoul(str, NULL, 0);
3526 return -ERANGE;
3527 *p = value;
3528 return cnt; 3710 return cnt;
3529} 3711}
3530 3712
@@ -3615,7 +3797,7 @@ static void __exit exit_ext4_fs(void)
3615} 3797}
3616 3798
3617MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3799MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
3618MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); 3800MODULE_DESCRIPTION("Fourth Extended Filesystem");
3619MODULE_LICENSE("GPL"); 3801MODULE_LICENSE("GPL");
3620module_init(init_ext4_fs) 3802module_init(init_ext4_fs)
3621module_exit(exit_ext4_fs) 3803module_exit(exit_ext4_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 80626d516fee..157ce6589c54 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { 457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); 458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
459 sb->s_dirt = 1; 459 sb->s_dirt = 1;
460 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 460 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
461 } 461 }
462} 462}
463 463
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
488 } else { 488 } else {
489 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 489 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
490 error = ext4_journal_dirty_metadata(handle, bh); 490 error = ext4_handle_dirty_metadata(handle, inode, bh);
491 if (IS_SYNC(inode)) 491 if (IS_SYNC(inode))
492 handle->h_sync = 1; 492 ext4_handle_sync(handle);
493 DQUOT_FREE_BLOCK(inode, 1); 493 DQUOT_FREE_BLOCK(inode, 1);
494 ea_bdebug(bh, "refcount now=%d; releasing", 494 ea_bdebug(bh, "refcount now=%d; releasing",
495 le32_to_cpu(BHDR(bh)->h_refcount)); 495 le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
724 if (error == -EIO) 724 if (error == -EIO)
725 goto bad_block; 725 goto bad_block;
726 if (!error) 726 if (!error)
727 error = ext4_journal_dirty_metadata(handle, 727 error = ext4_handle_dirty_metadata(handle,
728 bs->bh); 728 inode,
729 bs->bh);
729 if (error) 730 if (error)
730 goto cleanup; 731 goto cleanup;
731 goto inserted; 732 goto inserted;
@@ -794,8 +795,9 @@ inserted:
794 ea_bdebug(new_bh, "reusing; refcount now=%d", 795 ea_bdebug(new_bh, "reusing; refcount now=%d",
795 le32_to_cpu(BHDR(new_bh)->h_refcount)); 796 le32_to_cpu(BHDR(new_bh)->h_refcount));
796 unlock_buffer(new_bh); 797 unlock_buffer(new_bh);
797 error = ext4_journal_dirty_metadata(handle, 798 error = ext4_handle_dirty_metadata(handle,
798 new_bh); 799 inode,
800 new_bh);
799 if (error) 801 if (error)
800 goto cleanup_dquot; 802 goto cleanup_dquot;
801 } 803 }
@@ -810,8 +812,8 @@ inserted:
810 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
811 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
812 EXT4_I(inode)->i_block_group); 814 EXT4_I(inode)->i_block_group);
813 ext4_fsblk_t block = ext4_new_meta_block(handle, inode, 815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
814 goal, &error); 816 goal, NULL, &error);
815 if (error) 817 if (error)
816 goto cleanup; 818 goto cleanup;
817 ea_idebug(inode, "creating block %d", block); 819 ea_idebug(inode, "creating block %d", block);
@@ -833,7 +835,8 @@ getblk_failed:
833 set_buffer_uptodate(new_bh); 835 set_buffer_uptodate(new_bh);
834 unlock_buffer(new_bh); 836 unlock_buffer(new_bh);
835 ext4_xattr_cache_insert(new_bh); 837 ext4_xattr_cache_insert(new_bh);
836 error = ext4_journal_dirty_metadata(handle, new_bh); 838 error = ext4_handle_dirty_metadata(handle,
839 inode, new_bh);
837 if (error) 840 if (error)
838 goto cleanup; 841 goto cleanup;
839 } 842 }
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1040 */ 1043 */
1041 is.iloc.bh = NULL; 1044 is.iloc.bh = NULL;
1042 if (IS_SYNC(inode)) 1045 if (IS_SYNC(inode))
1043 handle->h_sync = 1; 1046 ext4_handle_sync(handle);
1044 } 1047 }
1045 1048
1046cleanup: 1049cleanup:
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
new file mode 100644
index 000000000000..d0a69ff25375
--- /dev/null
+++ b/fs/fat/Kconfig
@@ -0,0 +1,97 @@
1config FAT_FS
2 tristate
3 select NLS
4 help
5 If you want to use one of the FAT-based file systems (the MS-DOS and
6 VFAT (Windows 95) file systems), then you must say Y or M here
7 to include FAT support. You will then be able to mount partitions or
8 diskettes with FAT-based file systems and transparently access the
9 files on them, i.e. MSDOS files will look and behave just like all
10 other Unix files.
11
12 This FAT support is not a file system in itself, it only provides
13 the foundation for the other file systems. You will have to say Y or
14 M to at least one of "MSDOS fs support" or "VFAT fs support" in
15 order to make use of it.
16
17 Another way to read and write MSDOS floppies and hard drive
18 partitions from within Linux (but not transparently) is with the
19 mtools ("man mtools") program suite. You don't need to say Y here in
20 order to do that.
21
22 If you need to move large files on floppies between a DOS and a
23 Linux box, say Y here, mount the floppy under Linux with an MSDOS
24 file system and use GNU tar's M option. GNU tar is a program
25 available for Unix and DOS ("man tar" or "info tar").
26
27 The FAT support will enlarge your kernel by about 37 KB. If unsure,
28 say Y.
29
30 To compile this as a module, choose M here: the module will be called
31 fat. Note that if you compile the FAT support as a module, you
32 cannot compile any of the FAT-based file systems into the kernel
33 -- they will have to be modules as well.
34
35config MSDOS_FS
36 tristate "MSDOS fs support"
37 select FAT_FS
38 help
39 This allows you to mount MSDOS partitions of your hard drive (unless
40 they are compressed; to access compressed MSDOS partitions under
41 Linux, you can either use the DOS emulator DOSEMU, described in the
42 DOSEMU-HOWTO, available from
43 <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
44 <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
45 intend to use dosemu with a non-compressed MSDOS partition, say Y
46 here) and MSDOS floppies. This means that file access becomes
47 transparent, i.e. the MSDOS files look and behave just like all
48 other Unix files.
49
50 If you have Windows 95 or Windows NT installed on your MSDOS
51 partitions, you should use the VFAT file system (say Y to "VFAT fs
52 support" below), or you will not be able to see the long filenames
53 generated by Windows 95 / Windows NT.
54
55 This option will enlarge your kernel by about 7 KB. If unsure,
56 answer Y. This will only work if you said Y to "DOS FAT fs support"
57 as well. To compile this as a module, choose M here: the module will
58 be called msdos.
59
60config VFAT_FS
61 tristate "VFAT (Windows-95) fs support"
62 select FAT_FS
63 help
64 This option provides support for normal Windows file systems with
65 long filenames. That includes non-compressed FAT-based file systems
66 used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
67 programs from the mtools package.
68
69 The VFAT support enlarges your kernel by about 10 KB and it only
70 works if you said Y to the "DOS FAT fs support" above. Please read
71 the file <file:Documentation/filesystems/vfat.txt> for details. If
72 unsure, say Y.
73
74 To compile this as a module, choose M here: the module will be called
75 vfat.
76
77config FAT_DEFAULT_CODEPAGE
78 int "Default codepage for FAT"
79 depends on MSDOS_FS || VFAT_FS
80 default 437
81 help
82 This option should be set to the codepage of your FAT filesystems.
83 It can be overridden with the "codepage" mount option.
84 See <file:Documentation/filesystems/vfat.txt> for more information.
85
86config FAT_DEFAULT_IOCHARSET
87 string "Default iocharset for FAT"
88 depends on VFAT_FS
89 default "iso8859-1"
90 help
91 Set this to the default input/output character set you'd
92 like FAT to use. It should probably match the character set
93 that most of your FAT filesystems use, and can be overridden
94 with the "iocharset" mount option for FAT filesystems.
95 Note that "utf8" is not recommended for FAT filesystems.
96 If unsure, you shouldn't set "utf8" here.
97 See <file:Documentation/filesystems/vfat.txt> for more information.
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 67e058357098..3a7f603b6982 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -841,7 +841,6 @@ const struct file_operations fat_dir_operations = {
841 .compat_ioctl = fat_compat_dir_ioctl, 841 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 842#endif
843 .fsync = file_fsync, 843 .fsync = file_fsync,
844 .llseek = generic_file_llseek,
845}; 844};
846 845
847static int fat_get_short_entry(struct inode *dir, loff_t *pos, 846static int fat_get_short_entry(struct inode *dir, loff_t *pos,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d937aaf77374..6b74d09adbe5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -749,6 +749,8 @@ static struct dentry *fat_get_parent(struct dentry *child)
749 brelse(bh); 749 brelse(bh);
750 750
751 parent = d_obtain_alias(inode); 751 parent = d_obtain_alias(inode);
752 if (!IS_ERR(parent))
753 parent->d_op = sb->s_root->d_op;
752out: 754out:
753 unlock_super(sb); 755 unlock_super(sb);
754 756
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bf326d4356a3..8ae32e37673c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -78,7 +78,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
78 * for creation. 78 * for creation.
79 */ 79 */
80 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { 80 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
81 if (nd->flags & LOOKUP_CREATE) 81 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
82 return 0; 82 return 0;
83 } 83 }
84 84
diff --git a/fs/fcntl.c b/fs/fcntl.c
index cdc141946724..bd215cc791da 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -50,7 +50,7 @@ static int get_close_on_exec(unsigned int fd)
50 return res; 50 return res;
51} 51}
52 52
53asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) 53SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
54{ 54{
55 int err = -EBADF; 55 int err = -EBADF;
56 struct file * file, *tofree; 56 struct file * file, *tofree;
@@ -113,7 +113,7 @@ out_unlock:
113 return err; 113 return err;
114} 114}
115 115
116asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) 116SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
117{ 117{
118 if (unlikely(newfd == oldfd)) { /* corner case */ 118 if (unlikely(newfd == oldfd)) { /* corner case */
119 struct files_struct *files = current->files; 119 struct files_struct *files = current->files;
@@ -126,7 +126,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
126 return sys_dup3(oldfd, newfd, 0); 126 return sys_dup3(oldfd, newfd, 0);
127} 127}
128 128
129asmlinkage long sys_dup(unsigned int fildes) 129SYSCALL_DEFINE1(dup, unsigned int, fildes)
130{ 130{
131 int ret = -EBADF; 131 int ret = -EBADF;
132 struct file *file = fget(fildes); 132 struct file *file = fget(fildes);
@@ -335,7 +335,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
335 return err; 335 return err;
336} 336}
337 337
338asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg) 338SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
339{ 339{
340 struct file *filp; 340 struct file *filp;
341 long err = -EBADF; 341 long err = -EBADF;
@@ -358,7 +358,8 @@ out:
358} 358}
359 359
360#if BITS_PER_LONG == 32 360#if BITS_PER_LONG == 32
361asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg) 361SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
362 unsigned long, arg)
362{ 363{
363 struct file * filp; 364 struct file * filp;
364 long err; 365 long err;
diff --git a/fs/file_table.c b/fs/file_table.c
index 55895ccc08c6..da806aceae3f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -33,6 +33,9 @@ struct files_stat_struct files_stat = {
33/* public. Not pretty! */ 33/* public. Not pretty! */
34__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 34__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
35 35
36/* SLAB cache for file structures */
37static struct kmem_cache *filp_cachep __read_mostly;
38
36static struct percpu_counter nr_files __cacheline_aligned_in_smp; 39static struct percpu_counter nr_files __cacheline_aligned_in_smp;
37 40
38static inline void file_free_rcu(struct rcu_head *head) 41static inline void file_free_rcu(struct rcu_head *head)
@@ -399,7 +402,12 @@ too_bad:
399void __init files_init(unsigned long mempages) 402void __init files_init(unsigned long mempages)
400{ 403{
401 int n; 404 int n;
402 /* One file with associated inode and dcache is very roughly 1K. 405
406 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
407 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
408
409 /*
410 * One file with associated inode and dcache is very roughly 1K.
403 * Per default don't use more than 10% of our memory for files. 411 * Per default don't use more than 10% of our memory for files.
404 */ 412 */
405 413
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d0e20ced62dd..1aa70260e6d1 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -179,7 +179,7 @@ static int fs_maxindex(void)
179/* 179/*
180 * Whee.. Weird sysv syscall. 180 * Whee.. Weird sysv syscall.
181 */ 181 */
182asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2) 182SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
183{ 183{
184 int retval = -EINVAL; 184 int retval = -EINVAL;
185 185
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
253module_init(proc_filesystems_init); 253module_init(proc_filesystems_init);
254#endif 254#endif
255 255
256struct file_system_type *get_fs_type(const char *name) 256static struct file_system_type *__get_fs_type(const char *name, int len)
257{ 257{
258 struct file_system_type *fs; 258 struct file_system_type *fs;
259 const char *dot = strchr(name, '.');
260 unsigned len = dot ? dot - name : strlen(name);
261 259
262 read_lock(&file_systems_lock); 260 read_lock(&file_systems_lock);
263 fs = *(find_filesystem(name, len)); 261 fs = *(find_filesystem(name, len));
264 if (fs && !try_module_get(fs->owner)) 262 if (fs && !try_module_get(fs->owner))
265 fs = NULL; 263 fs = NULL;
266 read_unlock(&file_systems_lock); 264 read_unlock(&file_systems_lock);
267 if (!fs && (request_module("%.*s", len, name) == 0)) { 265 return fs;
268 read_lock(&file_systems_lock); 266}
269 fs = *(find_filesystem(name, len)); 267
270 if (fs && !try_module_get(fs->owner)) 268struct file_system_type *get_fs_type(const char *name)
271 fs = NULL; 269{
272 read_unlock(&file_systems_lock); 270 struct file_system_type *fs;
273 } 271 const char *dot = strchr(name, '.');
272 int len = dot ? dot - name : strlen(name);
273
274 fs = __get_fs_type(name, len);
275 if (!fs && (request_module("%.*s", len, name) == 0))
276 fs = __get_fs_type(name, len);
274 277
275 if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { 278 if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
276 put_filesystem(fs); 279 put_filesystem(fs);
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
new file mode 100644
index 000000000000..8dc1cd5c1efe
--- /dev/null
+++ b/fs/freevxfs/Kconfig
@@ -0,0 +1,16 @@
1config VXFS_FS
2 tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
3 depends on BLOCK
4 help
5 FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
6 file system format. VERITAS VxFS(TM) is the standard file system
7 of SCO UnixWare (and possibly others) and optionally available
8 for Sunsoft Solaris, HP-UX and many other operating systems.
9 Currently only readonly access is supported.
10
11 NOTE: the file system type as used by mount(1), mount(2) and
12 fstab(5) is 'vxfs' as it describes the file system format, not
13 the actual driver.
14
15 To compile this as a module, choose M here: the module will be
16 called freevxfs. If unsure, say N.
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 9f3f2ceb73f0..03a6ea5e99f7 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
325 if (!VXFS_ISIMMED(vip)) { 325 if (!VXFS_ISIMMED(vip)) {
326 ip->i_op = &page_symlink_inode_operations; 326 ip->i_op = &page_symlink_inode_operations;
327 ip->i_mapping->a_ops = &vxfs_aops; 327 ip->i_mapping->a_ops = &vxfs_aops;
328 } else 328 } else {
329 ip->i_op = &vxfs_immed_symlink_iops; 329 ip->i_op = &vxfs_immed_symlink_iops;
330 vip->vii_immed.vi_immed[ip->i_size] = '\0';
331 }
330 } else 332 } else
331 init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); 333 init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
332 334
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0ff0b8cf309..e5eaa62fd17f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
421 * If we're a pdlfush thread, then implement pdflush collision avoidance 421 * If we're a pdlfush thread, then implement pdflush collision avoidance
422 * against the entire list. 422 * against the entire list.
423 * 423 *
424 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
425 * that it can be located for waiting on in __writeback_single_inode().
426 *
427 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 424 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
428 * This function assumes that the blockdev superblock's inodes are backed by 425 * This function assumes that the blockdev superblock's inodes are backed by
429 * a variety of queues, so all inodes are searched. For other superblocks, 426 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
443 struct writeback_control *wbc) 440 struct writeback_control *wbc)
444{ 441{
445 const unsigned long start = jiffies; /* livelock avoidance */ 442 const unsigned long start = jiffies; /* livelock avoidance */
443 int sync = wbc->sync_mode == WB_SYNC_ALL;
446 444
447 spin_lock(&inode_lock); 445 spin_lock(&inode_lock);
448 if (!wbc->for_kupdate || list_empty(&sb->s_io)) 446 if (!wbc->for_kupdate || list_empty(&sb->s_io))
@@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
499 __iget(inode); 497 __iget(inode);
500 pages_skipped = wbc->pages_skipped; 498 pages_skipped = wbc->pages_skipped;
501 __writeback_single_inode(inode, wbc); 499 __writeback_single_inode(inode, wbc);
502 if (wbc->sync_mode == WB_SYNC_HOLD) {
503 inode->dirtied_when = jiffies;
504 list_move(&inode->i_list, &sb->s_dirty);
505 }
506 if (current_is_pdflush()) 500 if (current_is_pdflush())
507 writeback_release(bdi); 501 writeback_release(bdi);
508 if (wbc->pages_skipped != pages_skipped) { 502 if (wbc->pages_skipped != pages_skipped) {
@@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb,
523 if (!list_empty(&sb->s_more_io)) 517 if (!list_empty(&sb->s_more_io))
524 wbc->more_io = 1; 518 wbc->more_io = 1;
525 } 519 }
526 spin_unlock(&inode_lock); 520
521 if (sync) {
522 struct inode *inode, *old_inode = NULL;
523
524 /*
525 * Data integrity sync. Must wait for all pages under writeback,
526 * because there may have been pages dirtied before our sync
527 * call, but which had writeout started before we write it out.
528 * In which case, the inode may not be on the dirty list, but
529 * we still have to wait for that writeout.
530 */
531 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
532 struct address_space *mapping;
533
534 if (inode->i_state & (I_FREEING|I_WILL_FREE))
535 continue;
536 mapping = inode->i_mapping;
537 if (mapping->nrpages == 0)
538 continue;
539 __iget(inode);
540 spin_unlock(&inode_lock);
541 /*
542 * We hold a reference to 'inode' so it couldn't have
543 * been removed from s_inodes list while we dropped the
544 * inode_lock. We cannot iput the inode now as we can
545 * be holding the last reference and we cannot iput it
546 * under inode_lock. So we keep the reference and iput
547 * it later.
548 */
549 iput(old_inode);
550 old_inode = inode;
551
552 filemap_fdatawait(mapping);
553
554 cond_resched();
555
556 spin_lock(&inode_lock);
557 }
558 spin_unlock(&inode_lock);
559 iput(old_inode);
560 } else
561 spin_unlock(&inode_lock);
562
527 return; /* Leave any unwritten inodes on s_io */ 563 return; /* Leave any unwritten inodes on s_io */
528} 564}
529EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); 565EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
@@ -588,8 +624,7 @@ restart:
588 624
589/* 625/*
590 * writeback and wait upon the filesystem's dirty inodes. The caller will 626 * writeback and wait upon the filesystem's dirty inodes. The caller will
591 * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is 627 * do this in two passes - one to write, and one to wait.
592 * used to park the written inodes on sb->s_dirty for the wait pass.
593 * 628 *
594 * A finite limit is set on the number of pages which will be written. 629 * A finite limit is set on the number of pages which will be written.
595 * To prevent infinite livelock of sys_sync(). 630 * To prevent infinite livelock of sys_sync().
@@ -600,30 +635,21 @@ restart:
600void sync_inodes_sb(struct super_block *sb, int wait) 635void sync_inodes_sb(struct super_block *sb, int wait)
601{ 636{
602 struct writeback_control wbc = { 637 struct writeback_control wbc = {
603 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, 638 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
604 .range_start = 0, 639 .range_start = 0,
605 .range_end = LLONG_MAX, 640 .range_end = LLONG_MAX,
606 }; 641 };
607 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
608 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
609 642
610 wbc.nr_to_write = nr_dirty + nr_unstable + 643 if (!wait) {
611 (inodes_stat.nr_inodes - inodes_stat.nr_unused) + 644 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
612 nr_dirty + nr_unstable; 645 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
613 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
614 sync_sb_inodes(sb, &wbc);
615}
616 646
617/* 647 wbc.nr_to_write = nr_dirty + nr_unstable +
618 * Rather lame livelock avoidance. 648 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
619 */ 649 } else
620static void set_sb_syncing(int val) 650 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
621{ 651
622 struct super_block *sb; 652 sync_sb_inodes(sb, &wbc);
623 spin_lock(&sb_lock);
624 list_for_each_entry_reverse(sb, &super_blocks, s_list)
625 sb->s_syncing = val;
626 spin_unlock(&sb_lock);
627} 653}
628 654
629/** 655/**
@@ -652,9 +678,6 @@ static void __sync_inodes(int wait)
652 spin_lock(&sb_lock); 678 spin_lock(&sb_lock);
653restart: 679restart:
654 list_for_each_entry(sb, &super_blocks, s_list) { 680 list_for_each_entry(sb, &super_blocks, s_list) {
655 if (sb->s_syncing)
656 continue;
657 sb->s_syncing = 1;
658 sb->s_count++; 681 sb->s_count++;
659 spin_unlock(&sb_lock); 682 spin_unlock(&sb_lock);
660 down_read(&sb->s_umount); 683 down_read(&sb->s_umount);
@@ -672,13 +695,10 @@ restart:
672 695
673void sync_inodes(int wait) 696void sync_inodes(int wait)
674{ 697{
675 set_sb_syncing(0);
676 __sync_inodes(0); 698 __sync_inodes(0);
677 699
678 if (wait) { 700 if (wait)
679 set_sb_syncing(0);
680 __sync_inodes(1); 701 __sync_inodes(1);
681 }
682} 702}
683 703
684/** 704/**
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
new file mode 100644
index 000000000000..0cf160a94eda
--- /dev/null
+++ b/fs/fuse/Kconfig
@@ -0,0 +1,15 @@
1config FUSE_FS
2 tristate "FUSE (Filesystem in Userspace) support"
3 help
4 With FUSE it is possible to implement a fully functional filesystem
5 in a userspace program.
6
7 There's also companion library: libfuse. This library along with
8 utilities is available from the FUSE homepage:
9 <http://fuse.sourceforge.net/>
10
11 See <file:Documentation/filesystems/fuse.txt> for more information.
12 See <file:Documentation/Changes> for needed library/utility version.
13
14 If you want to develop a userspace FS, or if you want to use
15 a filesystem based on FUSE, answer Y or M.
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 4f3cab321415..99c99dfb0373 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
48 size_t size; 48 size_t size;
49 49
50 if (!*ppos) { 50 if (!*ppos) {
51 long value;
51 struct fuse_conn *fc = fuse_ctl_file_conn_get(file); 52 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
52 if (!fc) 53 if (!fc)
53 return 0; 54 return 0;
54 55
55 file->private_data=(void *)(long)atomic_read(&fc->num_waiting); 56 value = atomic_read(&fc->num_waiting);
57 file->private_data = (void *)value;
56 fuse_conn_put(fc); 58 fuse_conn_put(fc);
57 } 59 }
58 size = sprintf(tmp, "%ld\n", (long)file->private_data); 60 size = sprintf(tmp, "%ld\n", (long)file->private_data);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fba571648a8e..ba76b68c52ff 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
269 * Called with fc->lock, unlocks it 269 * Called with fc->lock, unlocks it
270 */ 270 */
271static void request_end(struct fuse_conn *fc, struct fuse_req *req) 271static void request_end(struct fuse_conn *fc, struct fuse_req *req)
272 __releases(fc->lock) 272__releases(&fc->lock)
273{ 273{
274 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; 274 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
275 req->end = NULL; 275 req->end = NULL;
@@ -281,7 +281,8 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
281 fc->blocked = 0; 281 fc->blocked = 0;
282 wake_up_all(&fc->blocked_waitq); 282 wake_up_all(&fc->blocked_waitq);
283 } 283 }
284 if (fc->num_background == FUSE_CONGESTION_THRESHOLD) { 284 if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
285 fc->connected) {
285 clear_bdi_congested(&fc->bdi, READ); 286 clear_bdi_congested(&fc->bdi, READ);
286 clear_bdi_congested(&fc->bdi, WRITE); 287 clear_bdi_congested(&fc->bdi, WRITE);
287 } 288 }
@@ -293,13 +294,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
293 wake_up(&req->waitq); 294 wake_up(&req->waitq);
294 if (end) 295 if (end)
295 end(fc, req); 296 end(fc, req);
296 else 297 fuse_put_request(fc, req);
297 fuse_put_request(fc, req);
298} 298}
299 299
300static void wait_answer_interruptible(struct fuse_conn *fc, 300static void wait_answer_interruptible(struct fuse_conn *fc,
301 struct fuse_req *req) 301 struct fuse_req *req)
302 __releases(fc->lock) __acquires(fc->lock) 302__releases(&fc->lock)
303__acquires(&fc->lock)
303{ 304{
304 if (signal_pending(current)) 305 if (signal_pending(current))
305 return; 306 return;
@@ -317,7 +318,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
317} 318}
318 319
319static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) 320static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
320 __releases(fc->lock) __acquires(fc->lock) 321__releases(&fc->lock)
322__acquires(&fc->lock)
321{ 323{
322 if (!fc->no_interrupt) { 324 if (!fc->no_interrupt) {
323 /* Any signal may interrupt this */ 325 /* Any signal may interrupt this */
@@ -380,7 +382,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
380 } 382 }
381} 383}
382 384
383void request_send(struct fuse_conn *fc, struct fuse_req *req) 385void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
384{ 386{
385 req->isreply = 1; 387 req->isreply = 1;
386 spin_lock(&fc->lock); 388 spin_lock(&fc->lock);
@@ -399,8 +401,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
399 spin_unlock(&fc->lock); 401 spin_unlock(&fc->lock);
400} 402}
401 403
402static void request_send_nowait_locked(struct fuse_conn *fc, 404static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
403 struct fuse_req *req) 405 struct fuse_req *req)
404{ 406{
405 req->background = 1; 407 req->background = 1;
406 fc->num_background++; 408 fc->num_background++;
@@ -414,11 +416,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
414 flush_bg_queue(fc); 416 flush_bg_queue(fc);
415} 417}
416 418
417static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) 419static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
418{ 420{
419 spin_lock(&fc->lock); 421 spin_lock(&fc->lock);
420 if (fc->connected) { 422 if (fc->connected) {
421 request_send_nowait_locked(fc, req); 423 fuse_request_send_nowait_locked(fc, req);
422 spin_unlock(&fc->lock); 424 spin_unlock(&fc->lock);
423 } else { 425 } else {
424 req->out.h.error = -ENOTCONN; 426 req->out.h.error = -ENOTCONN;
@@ -426,16 +428,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
426 } 428 }
427} 429}
428 430
429void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) 431void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
430{ 432{
431 req->isreply = 0; 433 req->isreply = 0;
432 request_send_nowait(fc, req); 434 fuse_request_send_nowait(fc, req);
433} 435}
434 436
435void request_send_background(struct fuse_conn *fc, struct fuse_req *req) 437void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
436{ 438{
437 req->isreply = 1; 439 req->isreply = 1;
438 request_send_nowait(fc, req); 440 fuse_request_send_nowait(fc, req);
439} 441}
440 442
441/* 443/*
@@ -443,10 +445,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
443 * 445 *
444 * fc->connected must have been checked previously 446 * fc->connected must have been checked previously
445 */ 447 */
446void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) 448void fuse_request_send_background_locked(struct fuse_conn *fc,
449 struct fuse_req *req)
447{ 450{
448 req->isreply = 1; 451 req->isreply = 1;
449 request_send_nowait_locked(fc, req); 452 fuse_request_send_nowait_locked(fc, req);
450} 453}
451 454
452/* 455/*
@@ -539,8 +542,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
539 BUG_ON(!cs->nr_segs); 542 BUG_ON(!cs->nr_segs);
540 cs->seglen = cs->iov[0].iov_len; 543 cs->seglen = cs->iov[0].iov_len;
541 cs->addr = (unsigned long) cs->iov[0].iov_base; 544 cs->addr = (unsigned long) cs->iov[0].iov_base;
542 cs->iov ++; 545 cs->iov++;
543 cs->nr_segs --; 546 cs->nr_segs--;
544 } 547 }
545 down_read(&current->mm->mmap_sem); 548 down_read(&current->mm->mmap_sem);
546 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, 549 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
@@ -589,9 +592,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
589 kunmap_atomic(mapaddr, KM_USER1); 592 kunmap_atomic(mapaddr, KM_USER1);
590 } 593 }
591 while (count) { 594 while (count) {
592 int err; 595 if (!cs->len) {
593 if (!cs->len && (err = fuse_copy_fill(cs))) 596 int err = fuse_copy_fill(cs);
594 return err; 597 if (err)
598 return err;
599 }
595 if (page) { 600 if (page) {
596 void *mapaddr = kmap_atomic(page, KM_USER1); 601 void *mapaddr = kmap_atomic(page, KM_USER1);
597 void *buf = mapaddr + offset; 602 void *buf = mapaddr + offset;
@@ -631,9 +636,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
631static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) 636static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
632{ 637{
633 while (size) { 638 while (size) {
634 int err; 639 if (!cs->len) {
635 if (!cs->len && (err = fuse_copy_fill(cs))) 640 int err = fuse_copy_fill(cs);
636 return err; 641 if (err)
642 return err;
643 }
637 fuse_copy_do(cs, &val, &size); 644 fuse_copy_do(cs, &val, &size);
638 } 645 }
639 return 0; 646 return 0;
@@ -664,6 +671,8 @@ static int request_pending(struct fuse_conn *fc)
664 671
665/* Wait until a request is available on the pending list */ 672/* Wait until a request is available on the pending list */
666static void request_wait(struct fuse_conn *fc) 673static void request_wait(struct fuse_conn *fc)
674__releases(&fc->lock)
675__acquires(&fc->lock)
667{ 676{
668 DECLARE_WAITQUEUE(wait, current); 677 DECLARE_WAITQUEUE(wait, current);
669 678
@@ -691,7 +700,7 @@ static void request_wait(struct fuse_conn *fc)
691 */ 700 */
692static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 701static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
693 const struct iovec *iov, unsigned long nr_segs) 702 const struct iovec *iov, unsigned long nr_segs)
694 __releases(fc->lock) 703__releases(&fc->lock)
695{ 704{
696 struct fuse_copy_state cs; 705 struct fuse_copy_state cs;
697 struct fuse_in_header ih; 706 struct fuse_in_header ih;
@@ -813,6 +822,40 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
813 return err; 822 return err;
814} 823}
815 824
825static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
826 struct fuse_copy_state *cs)
827{
828 struct fuse_notify_poll_wakeup_out outarg;
829 int err = -EINVAL;
830
831 if (size != sizeof(outarg))
832 goto err;
833
834 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
835 if (err)
836 goto err;
837
838 fuse_copy_finish(cs);
839 return fuse_notify_poll_wakeup(fc, &outarg);
840
841err:
842 fuse_copy_finish(cs);
843 return err;
844}
845
846static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
847 unsigned int size, struct fuse_copy_state *cs)
848{
849 switch (code) {
850 case FUSE_NOTIFY_POLL:
851 return fuse_notify_poll(fc, size, cs);
852
853 default:
854 fuse_copy_finish(cs);
855 return -EINVAL;
856 }
857}
858
816/* Look up request on processing list by unique ID */ 859/* Look up request on processing list by unique ID */
817static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) 860static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
818{ 861{
@@ -876,9 +919,22 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
876 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 919 err = fuse_copy_one(&cs, &oh, sizeof(oh));
877 if (err) 920 if (err)
878 goto err_finish; 921 goto err_finish;
922
923 err = -EINVAL;
924 if (oh.len != nbytes)
925 goto err_finish;
926
927 /*
928 * Zero oh.unique indicates unsolicited notification message
929 * and error contains notification code.
930 */
931 if (!oh.unique) {
932 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
933 return err ? err : nbytes;
934 }
935
879 err = -EINVAL; 936 err = -EINVAL;
880 if (!oh.unique || oh.error <= -1000 || oh.error > 0 || 937 if (oh.error <= -1000 || oh.error > 0)
881 oh.len != nbytes)
882 goto err_finish; 938 goto err_finish;
883 939
884 spin_lock(&fc->lock); 940 spin_lock(&fc->lock);
@@ -966,6 +1022,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
966 * This function releases and reacquires fc->lock 1022 * This function releases and reacquires fc->lock
967 */ 1023 */
968static void end_requests(struct fuse_conn *fc, struct list_head *head) 1024static void end_requests(struct fuse_conn *fc, struct list_head *head)
1025__releases(&fc->lock)
1026__acquires(&fc->lock)
969{ 1027{
970 while (!list_empty(head)) { 1028 while (!list_empty(head)) {
971 struct fuse_req *req; 1029 struct fuse_req *req;
@@ -988,7 +1046,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
988 * locked). 1046 * locked).
989 */ 1047 */
990static void end_io_requests(struct fuse_conn *fc) 1048static void end_io_requests(struct fuse_conn *fc)
991 __releases(fc->lock) __acquires(fc->lock) 1049__releases(&fc->lock)
1050__acquires(&fc->lock)
992{ 1051{
993 while (!list_empty(&fc->io)) { 1052 while (!list_empty(&fc->io)) {
994 struct fuse_req *req = 1053 struct fuse_req *req =
@@ -1002,11 +1061,11 @@ static void end_io_requests(struct fuse_conn *fc)
1002 wake_up(&req->waitq); 1061 wake_up(&req->waitq);
1003 if (end) { 1062 if (end) {
1004 req->end = NULL; 1063 req->end = NULL;
1005 /* The end function will consume this reference */
1006 __fuse_get_request(req); 1064 __fuse_get_request(req);
1007 spin_unlock(&fc->lock); 1065 spin_unlock(&fc->lock);
1008 wait_event(req->waitq, !req->locked); 1066 wait_event(req->waitq, !req->locked);
1009 end(fc, req); 1067 end(fc, req);
1068 fuse_put_request(fc, req);
1010 spin_lock(&fc->lock); 1069 spin_lock(&fc->lock);
1011 } 1070 }
1012 } 1071 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 95bc22bdd060..fdff346e96fd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
189 parent = dget_parent(entry); 189 parent = dget_parent(entry);
190 fuse_lookup_init(fc, req, get_node_id(parent->d_inode), 190 fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
191 &entry->d_name, &outarg); 191 &entry->d_name, &outarg);
192 request_send(fc, req); 192 fuse_request_send(fc, req);
193 dput(parent); 193 dput(parent);
194 err = req->out.h.error; 194 err = req->out.h.error;
195 fuse_put_request(fc, req); 195 fuse_put_request(fc, req);
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
204 return 0; 204 return 0;
205 } 205 }
206 spin_lock(&fc->lock); 206 spin_lock(&fc->lock);
207 fi->nlookup ++; 207 fi->nlookup++;
208 spin_unlock(&fc->lock); 208 spin_unlock(&fc->lock);
209 } 209 }
210 fuse_put_request(fc, forget_req); 210 fuse_put_request(fc, forget_req);
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
283 attr_version = fuse_get_attr_version(fc); 283 attr_version = fuse_get_attr_version(fc);
284 284
285 fuse_lookup_init(fc, req, nodeid, name, outarg); 285 fuse_lookup_init(fc, req, nodeid, name, outarg);
286 request_send(fc, req); 286 fuse_request_send(fc, req);
287 err = req->out.h.error; 287 err = req->out.h.error;
288 fuse_put_request(fc, req); 288 fuse_put_request(fc, req);
289 /* Zero nodeid is same as -ENOENT, but with valid timeout */ 289 /* Zero nodeid is same as -ENOENT, but with valid timeout */
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
369{ 369{
370 fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); 370 fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
371 ff->reserved_req->force = 1; 371 ff->reserved_req->force = 1;
372 request_send(fc, ff->reserved_req); 372 fuse_request_send(fc, ff->reserved_req);
373 fuse_put_request(fc, ff->reserved_req); 373 fuse_put_request(fc, ff->reserved_req);
374 kfree(ff); 374 kfree(ff);
375} 375}
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
408 goto out_put_forget_req; 408 goto out_put_forget_req;
409 409
410 err = -ENOMEM; 410 err = -ENOMEM;
411 ff = fuse_file_alloc(); 411 ff = fuse_file_alloc(fc);
412 if (!ff) 412 if (!ff)
413 goto out_put_request; 413 goto out_put_request;
414 414
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
432 req->out.args[0].value = &outentry; 432 req->out.args[0].value = &outentry;
433 req->out.args[1].size = sizeof(outopen); 433 req->out.args[1].size = sizeof(outopen);
434 req->out.args[1].value = &outopen; 434 req->out.args[1].value = &outopen;
435 request_send(fc, req); 435 fuse_request_send(fc, req);
436 err = req->out.h.error; 436 err = req->out.h.error;
437 if (err) { 437 if (err) {
438 if (err == -ENOSYS) 438 if (err == -ENOSYS)
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
502 else 502 else
503 req->out.args[0].size = sizeof(outarg); 503 req->out.args[0].size = sizeof(outarg);
504 req->out.args[0].value = &outarg; 504 req->out.args[0].value = &outarg;
505 request_send(fc, req); 505 fuse_request_send(fc, req);
506 err = req->out.h.error; 506 err = req->out.h.error;
507 fuse_put_request(fc, req); 507 fuse_put_request(fc, req);
508 if (err) 508 if (err)
@@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
631 req->in.numargs = 1; 631 req->in.numargs = 1;
632 req->in.args[0].size = entry->d_name.len + 1; 632 req->in.args[0].size = entry->d_name.len + 1;
633 req->in.args[0].value = entry->d_name.name; 633 req->in.args[0].value = entry->d_name.name;
634 request_send(fc, req); 634 fuse_request_send(fc, req);
635 err = req->out.h.error; 635 err = req->out.h.error;
636 fuse_put_request(fc, req); 636 fuse_put_request(fc, req);
637 if (!err) { 637 if (!err) {
638 struct inode *inode = entry->d_inode; 638 struct inode *inode = entry->d_inode;
639 639
640 /* Set nlink to zero so the inode can be cleared, if 640 /*
641 the inode does have more links this will be 641 * Set nlink to zero so the inode can be cleared, if the inode
642 discovered at the next lookup/getattr */ 642 * does have more links this will be discovered at the next
643 * lookup/getattr.
644 */
643 clear_nlink(inode); 645 clear_nlink(inode);
644 fuse_invalidate_attr(inode); 646 fuse_invalidate_attr(inode);
645 fuse_invalidate_attr(dir); 647 fuse_invalidate_attr(dir);
@@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
662 req->in.numargs = 1; 664 req->in.numargs = 1;
663 req->in.args[0].size = entry->d_name.len + 1; 665 req->in.args[0].size = entry->d_name.len + 1;
664 req->in.args[0].value = entry->d_name.name; 666 req->in.args[0].value = entry->d_name.name;
665 request_send(fc, req); 667 fuse_request_send(fc, req);
666 err = req->out.h.error; 668 err = req->out.h.error;
667 fuse_put_request(fc, req); 669 fuse_put_request(fc, req);
668 if (!err) { 670 if (!err) {
@@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
695 req->in.args[1].value = oldent->d_name.name; 697 req->in.args[1].value = oldent->d_name.name;
696 req->in.args[2].size = newent->d_name.len + 1; 698 req->in.args[2].size = newent->d_name.len + 1;
697 req->in.args[2].value = newent->d_name.name; 699 req->in.args[2].value = newent->d_name.name;
698 request_send(fc, req); 700 fuse_request_send(fc, req);
699 err = req->out.h.error; 701 err = req->out.h.error;
700 fuse_put_request(fc, req); 702 fuse_put_request(fc, req);
701 if (!err) { 703 if (!err) {
@@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
811 else 813 else
812 req->out.args[0].size = sizeof(outarg); 814 req->out.args[0].size = sizeof(outarg);
813 req->out.args[0].value = &outarg; 815 req->out.args[0].value = &outarg;
814 request_send(fc, req); 816 fuse_request_send(fc, req);
815 err = req->out.h.error; 817 err = req->out.h.error;
816 fuse_put_request(fc, req); 818 fuse_put_request(fc, req);
817 if (!err) { 819 if (!err) {
@@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask)
911 req->in.numargs = 1; 913 req->in.numargs = 1;
912 req->in.args[0].size = sizeof(inarg); 914 req->in.args[0].size = sizeof(inarg);
913 req->in.args[0].value = &inarg; 915 req->in.args[0].value = &inarg;
914 request_send(fc, req); 916 fuse_request_send(fc, req);
915 err = req->out.h.error; 917 err = req->out.h.error;
916 fuse_put_request(fc, req); 918 fuse_put_request(fc, req);
917 if (err == -ENOSYS) { 919 if (err == -ENOSYS) {
@@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1033 req->num_pages = 1; 1035 req->num_pages = 1;
1034 req->pages[0] = page; 1036 req->pages[0] = page;
1035 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); 1037 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
1036 request_send(fc, req); 1038 fuse_request_send(fc, req);
1037 nbytes = req->out.args[0].size; 1039 nbytes = req->out.args[0].size;
1038 err = req->out.h.error; 1040 err = req->out.h.error;
1039 fuse_put_request(fc, req); 1041 fuse_put_request(fc, req);
@@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry)
1067 req->out.numargs = 1; 1069 req->out.numargs = 1;
1068 req->out.args[0].size = PAGE_SIZE - 1; 1070 req->out.args[0].size = PAGE_SIZE - 1;
1069 req->out.args[0].value = link; 1071 req->out.args[0].value = link;
1070 request_send(fc, req); 1072 fuse_request_send(fc, req);
1071 if (req->out.h.error) { 1073 if (req->out.h.error) {
1072 free_page((unsigned long) link); 1074 free_page((unsigned long) link);
1073 link = ERR_PTR(req->out.h.error); 1075 link = ERR_PTR(req->out.h.error);
@@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1273 else 1275 else
1274 req->out.args[0].size = sizeof(outarg); 1276 req->out.args[0].size = sizeof(outarg);
1275 req->out.args[0].value = &outarg; 1277 req->out.args[0].value = &outarg;
1276 request_send(fc, req); 1278 fuse_request_send(fc, req);
1277 err = req->out.h.error; 1279 err = req->out.h.error;
1278 fuse_put_request(fc, req); 1280 fuse_put_request(fc, req);
1279 if (err) { 1281 if (err) {
@@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
1367 req->in.args[1].value = name; 1369 req->in.args[1].value = name;
1368 req->in.args[2].size = size; 1370 req->in.args[2].size = size;
1369 req->in.args[2].value = value; 1371 req->in.args[2].value = value;
1370 request_send(fc, req); 1372 fuse_request_send(fc, req);
1371 err = req->out.h.error; 1373 err = req->out.h.error;
1372 fuse_put_request(fc, req); 1374 fuse_put_request(fc, req);
1373 if (err == -ENOSYS) { 1375 if (err == -ENOSYS) {
@@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
1413 req->out.args[0].size = sizeof(outarg); 1415 req->out.args[0].size = sizeof(outarg);
1414 req->out.args[0].value = &outarg; 1416 req->out.args[0].value = &outarg;
1415 } 1417 }
1416 request_send(fc, req); 1418 fuse_request_send(fc, req);
1417 ret = req->out.h.error; 1419 ret = req->out.h.error;
1418 if (!ret) 1420 if (!ret)
1419 ret = size ? req->out.args[0].size : outarg.size; 1421 ret = size ? req->out.args[0].size : outarg.size;
@@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
1463 req->out.args[0].size = sizeof(outarg); 1465 req->out.args[0].size = sizeof(outarg);
1464 req->out.args[0].value = &outarg; 1466 req->out.args[0].value = &outarg;
1465 } 1467 }
1466 request_send(fc, req); 1468 fuse_request_send(fc, req);
1467 ret = req->out.h.error; 1469 ret = req->out.h.error;
1468 if (!ret) 1470 if (!ret)
1469 ret = size ? req->out.args[0].size : outarg.size; 1471 ret = size ? req->out.args[0].size : outarg.size;
@@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1496 req->in.numargs = 1; 1498 req->in.numargs = 1;
1497 req->in.args[0].size = strlen(name) + 1; 1499 req->in.args[0].size = strlen(name) + 1;
1498 req->in.args[0].value = name; 1500 req->in.args[0].value = name;
1499 request_send(fc, req); 1501 fuse_request_send(fc, req);
1500 err = req->out.h.error; 1502 err = req->out.h.error;
1501 fuse_put_request(fc, req); 1503 fuse_put_request(fc, req);
1502 if (err == -ENOSYS) { 1504 if (err == -ENOSYS) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b82..d9fdb7cec538 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
39 req->out.numargs = 1; 39 req->out.numargs = 1;
40 req->out.args[0].size = sizeof(*outargp); 40 req->out.args[0].size = sizeof(*outargp);
41 req->out.args[0].value = outargp; 41 req->out.args[0].value = outargp;
42 request_send(fc, req); 42 fuse_request_send(fc, req);
43 err = req->out.h.error; 43 err = req->out.h.error;
44 fuse_put_request(fc, req); 44 fuse_put_request(fc, req);
45 45
46 return err; 46 return err;
47} 47}
48 48
49struct fuse_file *fuse_file_alloc(void) 49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
50{ 50{
51 struct fuse_file *ff; 51 struct fuse_file *ff;
52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
@@ -54,11 +54,16 @@ struct fuse_file *fuse_file_alloc(void)
54 ff->reserved_req = fuse_request_alloc(); 54 ff->reserved_req = fuse_request_alloc();
55 if (!ff->reserved_req) { 55 if (!ff->reserved_req) {
56 kfree(ff); 56 kfree(ff);
57 ff = NULL; 57 return NULL;
58 } else { 58 } else {
59 INIT_LIST_HEAD(&ff->write_entry); 59 INIT_LIST_HEAD(&ff->write_entry);
60 atomic_set(&ff->count, 0); 60 atomic_set(&ff->count, 0);
61 spin_lock(&fc->lock);
62 ff->kh = ++fc->khctr;
63 spin_unlock(&fc->lock);
61 } 64 }
65 RB_CLEAR_NODE(&ff->polled_node);
66 init_waitqueue_head(&ff->poll_wait);
62 } 67 }
63 return ff; 68 return ff;
64} 69}
@@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
79{ 84{
80 dput(req->misc.release.dentry); 85 dput(req->misc.release.dentry);
81 mntput(req->misc.release.vfsmount); 86 mntput(req->misc.release.vfsmount);
82 fuse_put_request(fc, req);
83} 87}
84 88
85static void fuse_file_put(struct fuse_file *ff) 89static void fuse_file_put(struct fuse_file *ff)
@@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff)
89 struct inode *inode = req->misc.release.dentry->d_inode; 93 struct inode *inode = req->misc.release.dentry->d_inode;
90 struct fuse_conn *fc = get_fuse_conn(inode); 94 struct fuse_conn *fc = get_fuse_conn(inode);
91 req->end = fuse_release_end; 95 req->end = fuse_release_end;
92 request_send_background(fc, req); 96 fuse_request_send_background(fc, req);
93 kfree(ff); 97 kfree(ff);
94 } 98 }
95} 99}
@@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file,
109 113
110int fuse_open_common(struct inode *inode, struct file *file, int isdir) 114int fuse_open_common(struct inode *inode, struct file *file, int isdir)
111{ 115{
116 struct fuse_conn *fc = get_fuse_conn(inode);
112 struct fuse_open_out outarg; 117 struct fuse_open_out outarg;
113 struct fuse_file *ff; 118 struct fuse_file *ff;
114 int err; 119 int err;
@@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
121 if (err) 126 if (err)
122 return err; 127 return err;
123 128
124 ff = fuse_file_alloc(); 129 ff = fuse_file_alloc(fc);
125 if (!ff) 130 if (!ff)
126 return -ENOMEM; 131 return -ENOMEM;
127 132
@@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
167 172
168 spin_lock(&fc->lock); 173 spin_lock(&fc->lock);
169 list_del(&ff->write_entry); 174 list_del(&ff->write_entry);
175 if (!RB_EMPTY_NODE(&ff->polled_node))
176 rb_erase(&ff->polled_node, &fc->polled_files);
170 spin_unlock(&fc->lock); 177 spin_unlock(&fc->lock);
178
179 wake_up_interruptible_sync(&ff->poll_wait);
171 /* 180 /*
172 * Normally this will send the RELEASE request, 181 * Normally this will send the RELEASE request,
173 * however if some asynchronous READ or WRITE requests 182 * however if some asynchronous READ or WRITE requests
@@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
280 req->in.args[0].size = sizeof(inarg); 289 req->in.args[0].size = sizeof(inarg);
281 req->in.args[0].value = &inarg; 290 req->in.args[0].value = &inarg;
282 req->force = 1; 291 req->force = 1;
283 request_send(fc, req); 292 fuse_request_send(fc, req);
284 err = req->out.h.error; 293 err = req->out.h.error;
285 fuse_put_request(fc, req); 294 fuse_put_request(fc, req);
286 if (err == -ENOSYS) { 295 if (err == -ENOSYS) {
@@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
344 req->in.numargs = 1; 353 req->in.numargs = 1;
345 req->in.args[0].size = sizeof(inarg); 354 req->in.args[0].size = sizeof(inarg);
346 req->in.args[0].value = &inarg; 355 req->in.args[0].value = &inarg;
347 request_send(fc, req); 356 fuse_request_send(fc, req);
348 err = req->out.h.error; 357 err = req->out.h.error;
349 fuse_put_request(fc, req); 358 fuse_put_request(fc, req);
350 if (err == -ENOSYS) { 359 if (err == -ENOSYS) {
@@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
396 inarg->read_flags |= FUSE_READ_LOCKOWNER; 405 inarg->read_flags |= FUSE_READ_LOCKOWNER;
397 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 406 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
398 } 407 }
399 request_send(fc, req); 408 fuse_request_send(fc, req);
400 return req->out.args[0].size; 409 return req->out.args[0].size;
401} 410}
402 411
@@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
493 } 502 }
494 if (req->ff) 503 if (req->ff)
495 fuse_file_put(req->ff); 504 fuse_file_put(req->ff);
496 fuse_put_request(fc, req);
497} 505}
498 506
499static void fuse_send_readpages(struct fuse_req *req, struct file *file, 507static void fuse_send_readpages(struct fuse_req *req, struct file *file,
@@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
509 struct fuse_file *ff = file->private_data; 517 struct fuse_file *ff = file->private_data;
510 req->ff = fuse_file_get(ff); 518 req->ff = fuse_file_get(ff);
511 req->end = fuse_readpages_end; 519 req->end = fuse_readpages_end;
512 request_send_background(fc, req); 520 fuse_request_send_background(fc, req);
513 } else { 521 } else {
514 request_send(fc, req); 522 fuse_request_send(fc, req);
515 fuse_readpages_end(fc, req); 523 fuse_readpages_end(fc, req);
524 fuse_put_request(fc, req);
516 } 525 }
517} 526}
518 527
@@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
543 } 552 }
544 } 553 }
545 req->pages[req->num_pages] = page; 554 req->pages[req->num_pages] = page;
546 req->num_pages ++; 555 req->num_pages++;
547 return 0; 556 return 0;
548} 557}
549 558
@@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
636 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 645 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
637 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 646 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
638 } 647 }
639 request_send(fc, req); 648 fuse_request_send(fc, req);
640 return req->misc.write.out.size; 649 return req->misc.write.out.size;
641} 650}
642 651
@@ -646,7 +655,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
646{ 655{
647 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 656 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
648 657
649 *pagep = __grab_cache_page(mapping, index); 658 *pagep = grab_cache_page_write_begin(mapping, index, flags);
650 if (!*pagep) 659 if (!*pagep)
651 return -ENOMEM; 660 return -ENOMEM;
652 return 0; 661 return 0;
@@ -779,7 +788,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
779 break; 788 break;
780 789
781 err = -ENOMEM; 790 err = -ENOMEM;
782 page = __grab_cache_page(mapping, index); 791 page = grab_cache_page_write_begin(mapping, index, 0);
783 if (!page) 792 if (!page)
784 break; 793 break;
785 794
@@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1042{ 1051{
1043 __free_page(req->pages[0]); 1052 __free_page(req->pages[0]);
1044 fuse_file_put(req->ff); 1053 fuse_file_put(req->ff);
1045 fuse_put_request(fc, req);
1046} 1054}
1047 1055
1048static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1056static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1060 1068
1061/* Called under fc->lock, may release and reacquire it */ 1069/* Called under fc->lock, may release and reacquire it */
1062static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) 1070static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1071__releases(&fc->lock)
1072__acquires(&fc->lock)
1063{ 1073{
1064 struct fuse_inode *fi = get_fuse_inode(req->inode); 1074 struct fuse_inode *fi = get_fuse_inode(req->inode);
1065 loff_t size = i_size_read(req->inode); 1075 loff_t size = i_size_read(req->inode);
@@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1079 1089
1080 req->in.args[1].size = inarg->size; 1090 req->in.args[1].size = inarg->size;
1081 fi->writectr++; 1091 fi->writectr++;
1082 request_send_background_locked(fc, req); 1092 fuse_request_send_background_locked(fc, req);
1083 return; 1093 return;
1084 1094
1085 out_free: 1095 out_free:
1086 fuse_writepage_finish(fc, req); 1096 fuse_writepage_finish(fc, req);
1087 spin_unlock(&fc->lock); 1097 spin_unlock(&fc->lock);
1088 fuse_writepage_free(fc, req); 1098 fuse_writepage_free(fc, req);
1099 fuse_put_request(fc, req);
1089 spin_lock(&fc->lock); 1100 spin_lock(&fc->lock);
1090} 1101}
1091 1102
@@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1096 * Called with fc->lock 1107 * Called with fc->lock
1097 */ 1108 */
1098void fuse_flush_writepages(struct inode *inode) 1109void fuse_flush_writepages(struct inode *inode)
1110__releases(&fc->lock)
1111__acquires(&fc->lock)
1099{ 1112{
1100 struct fuse_conn *fc = get_fuse_conn(inode); 1113 struct fuse_conn *fc = get_fuse_conn(inode);
1101 struct fuse_inode *fi = get_fuse_inode(inode); 1114 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
1325 req->out.numargs = 1; 1338 req->out.numargs = 1;
1326 req->out.args[0].size = sizeof(outarg); 1339 req->out.args[0].size = sizeof(outarg);
1327 req->out.args[0].value = &outarg; 1340 req->out.args[0].value = &outarg;
1328 request_send(fc, req); 1341 fuse_request_send(fc, req);
1329 err = req->out.h.error; 1342 err = req->out.h.error;
1330 fuse_put_request(fc, req); 1343 fuse_put_request(fc, req);
1331 if (!err) 1344 if (!err)
@@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
1357 return PTR_ERR(req); 1370 return PTR_ERR(req);
1358 1371
1359 fuse_lk_fill(req, file, fl, opcode, pid, flock); 1372 fuse_lk_fill(req, file, fl, opcode, pid, flock);
1360 request_send(fc, req); 1373 fuse_request_send(fc, req);
1361 err = req->out.h.error; 1374 err = req->out.h.error;
1362 /* locking is restartable */ 1375 /* locking is restartable */
1363 if (err == -EINTR) 1376 if (err == -EINTR)
@@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1433 req->out.numargs = 1; 1446 req->out.numargs = 1;
1434 req->out.args[0].size = sizeof(outarg); 1447 req->out.args[0].size = sizeof(outarg);
1435 req->out.args[0].value = &outarg; 1448 req->out.args[0].value = &outarg;
1436 request_send(fc, req); 1449 fuse_request_send(fc, req);
1437 err = req->out.h.error; 1450 err = req->out.h.error;
1438 fuse_put_request(fc, req); 1451 fuse_put_request(fc, req);
1439 if (err == -ENOSYS) 1452 if (err == -ENOSYS)
@@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1470 return retval; 1483 return retval;
1471} 1484}
1472 1485
1486static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1487 unsigned int nr_segs, size_t bytes, bool to_user)
1488{
1489 struct iov_iter ii;
1490 int page_idx = 0;
1491
1492 if (!bytes)
1493 return 0;
1494
1495 iov_iter_init(&ii, iov, nr_segs, bytes, 0);
1496
1497 while (iov_iter_count(&ii)) {
1498 struct page *page = pages[page_idx++];
1499 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1500 void *kaddr, *map;
1501
1502 kaddr = map = kmap(page);
1503
1504 while (todo) {
1505 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
1506 size_t iov_len = ii.iov->iov_len - ii.iov_offset;
1507 size_t copy = min(todo, iov_len);
1508 size_t left;
1509
1510 if (!to_user)
1511 left = copy_from_user(kaddr, uaddr, copy);
1512 else
1513 left = copy_to_user(uaddr, kaddr, copy);
1514
1515 if (unlikely(left))
1516 return -EFAULT;
1517
1518 iov_iter_advance(&ii, copy);
1519 todo -= copy;
1520 kaddr += copy;
1521 }
1522
1523 kunmap(map);
1524 }
1525
1526 return 0;
1527}
1528
/*
 * For ioctls, there is no generic way to determine how much memory
 * needs to be read and/or written.  Furthermore, ioctls are allowed
 * to dereference the passed pointer, so the parameter requires deep
 * copying but FUSE has no idea whatsoever about what to copy in or
 * out.
 *
 * This is solved by allowing the FUSE server to retry the ioctl with
 * the necessary in/out iovecs.  Let's assume the ioctl implementation
 * needs to read in the following structure.
 *
 * struct a {
 *	char	*buf;
 *	size_t	buflen;
 * }
 *
 * On the first callout to the FUSE server, inarg->in_size and
 * inarg->out_size will be zero; then, the server completes the ioctl
 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
 * the actual iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a) } }
 *
 * which tells FUSE to copy in the requested area and retry the ioctl.
 * On the second round, the server has access to the structure and
 * from that it can tell what to look for next, so on this invocation,
 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and the iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a)	},
 *   { .iov_base = a.buf,	.iov_len = a.buflen		} }
 *
 * FUSE will copy both struct a and the pointed buffer from the
 * process doing the ioctl and retry the ioctl with both struct a and
 * the buffer.
 *
 * This time, the FUSE server has everything it needs and completes
 * the ioctl without FUSE_IOCTL_RETRY, which finishes the ioctl call.
 *
 * Copying data out works the same way.
 *
 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
 * automatically initializes in and out iovs by decoding @cmd with
 * _IOC_* macros and the server is not allowed to request RETRY.  This
 * limits ioctl data transfers to well-formed ioctls and is the forced
 * behavior for all FUSE servers.
 *
 * @file:  file the ioctl was issued on
 * @cmd:   ioctl command number
 * @arg:   ioctl argument as passed by the caller
 * @flags: FUSE_IOCTL_* flags forwarded to the server in inarg.flags
 *
 * Returns a negative errno on failure, otherwise the result value the
 * server placed in the reply (outarg.result).
 */
static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
			       unsigned long arg, unsigned int flags)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_ioctl_in inarg = {
		.fh = ff->fh,
		.cmd = cmd,
		.arg = arg,
		.flags = flags
	};
	struct fuse_ioctl_out outarg;
	struct fuse_req *req = NULL;
	struct page **pages = NULL;
	struct page *iov_page = NULL;
	struct iovec *in_iov = NULL, *out_iov = NULL;
	unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
	size_t in_size, out_size, transferred;
	int err;

	/* assume all the iovs returned by client always fit in a page */
	BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);

	if (!fuse_allow_task(fc, current))
		return -EACCES;

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	/*
	 * pages[] collects the data buffer pages handed to the request;
	 * iov_page holds the in/out iovec array itself.
	 */
	err = -ENOMEM;
	pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
	iov_page = alloc_page(GFP_KERNEL);
	if (!pages || !iov_page)
		goto out;

	/*
	 * If restricted, initialize IO parameters as encoded in @cmd.
	 * RETRY from server is not allowed.
	 */
	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
		struct iovec *iov = page_address(iov_page);

		iov->iov_base = (void __user *)arg;
		iov->iov_len = _IOC_SIZE(cmd);

		if (_IOC_DIR(cmd) & _IOC_WRITE) {
			in_iov = iov;
			in_iovs = 1;
		}

		if (_IOC_DIR(cmd) & _IOC_READ) {
			out_iov = iov;
			out_iovs = 1;
		}
	}

	/* re-entered after every FUSE_IOCTL_RETRY reply with the new iovs */
 retry:
	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
	inarg.out_size = out_size = iov_length(out_iov, out_iovs);

	/*
	 * Out data can be used either for actual out data or iovs,
	 * make sure there always is at least one page.
	 */
	out_size = max_t(size_t, out_size, PAGE_SIZE);
	max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);

	/* make sure there are enough buffer pages and init request with them */
	err = -ENOMEM;
	if (max_pages > FUSE_MAX_PAGES_PER_REQ)
		goto out;
	/* pages from earlier rounds are kept; only the shortfall is allocated */
	while (num_pages < max_pages) {
		pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
		if (!pages[num_pages])
			goto out;
		num_pages++;
	}

	req = fuse_get_req(fc);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		/* keep the out: path from putting an error pointer */
		req = NULL;
		goto out;
	}
	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
	req->num_pages = num_pages;

	/* okay, let's send it to the client */
	req->in.h.opcode = FUSE_IOCTL;
	req->in.h.nodeid = get_node_id(inode);
	req->in.numargs = 1;
	req->in.args[0].size = sizeof(inarg);
	req->in.args[0].value = &inarg;
	if (in_size) {
		req->in.numargs++;
		req->in.args[1].size = in_size;
		req->in.argpages = 1;

		/* pull the caller's input data into the request pages */
		err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
					   false);
		if (err)
			goto out;
	}

	req->out.numargs = 2;
	req->out.args[0].size = sizeof(outarg);
	req->out.args[0].value = &outarg;
	req->out.args[1].size = out_size;
	req->out.argpages = 1;
	req->out.argvar = 1;

	fuse_request_send(fc, req);
	err = req->out.h.error;
	transferred = req->out.args[1].size;
	fuse_put_request(fc, req);
	/* already released; make sure the out: path does not put it again */
	req = NULL;
	if (err)
		goto out;

	/* did it ask for retry? */
	if (outarg.flags & FUSE_IOCTL_RETRY) {
		char *vaddr;

		/* no retry if in restricted mode */
		err = -EIO;
		if (!(flags & FUSE_IOCTL_UNRESTRICTED))
			goto out;

		in_iovs = outarg.in_iovs;
		out_iovs = outarg.out_iovs;

		/*
		 * Make sure things are in boundary, separate checks
		 * are to protect against overflow.
		 */
		err = -ENOMEM;
		if (in_iovs > FUSE_IOCTL_MAX_IOV ||
		    out_iovs > FUSE_IOCTL_MAX_IOV ||
		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
			goto out;

		/* the reply payload must be exactly the announced iovec array */
		err = -EIO;
		if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
			goto out;

		/* okay, copy in iovs and retry */
		vaddr = kmap_atomic(pages[0], KM_USER0);
		memcpy(page_address(iov_page), vaddr, transferred);
		kunmap_atomic(vaddr, KM_USER0);

		/* in iovs come first, out iovs follow directly in the same page */
		in_iov = page_address(iov_page);
		out_iov = in_iov + in_iovs;

		goto retry;
	}

	/* the server may not return more data than the out iovs can hold */
	err = -EIO;
	if (transferred > inarg.out_size)
		goto out;

	err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
 out:
	if (req)
		fuse_put_request(fc, req);
	if (iov_page)
		__free_page(iov_page);
	while (num_pages)
		__free_page(pages[--num_pages]);
	kfree(pages);

	return err ? err : outarg.result;
}
1749
/*
 * Native ioctl entry point: forwards to fuse_file_do_ioctl() with no
 * FUSE_IOCTL_* flags set.
 */
static long fuse_file_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	const unsigned int ioctl_flags = 0;

	return fuse_file_do_ioctl(file, cmd, arg, ioctl_flags);
}
1755
1756static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1757 unsigned long arg)
1758{
1759 return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
1760}
1761
1762/*
1763 * All files which have been polled are linked to RB tree
1764 * fuse_conn->polled_files which is indexed by kh. Walk the tree and
1765 * find the matching one.
1766 */
1767static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
1768 struct rb_node **parent_out)
1769{
1770 struct rb_node **link = &fc->polled_files.rb_node;
1771 struct rb_node *last = NULL;
1772
1773 while (*link) {
1774 struct fuse_file *ff;
1775
1776 last = *link;
1777 ff = rb_entry(last, struct fuse_file, polled_node);
1778
1779 if (kh < ff->kh)
1780 link = &last->rb_left;
1781 else if (kh > ff->kh)
1782 link = &last->rb_right;
1783 else
1784 return link;
1785 }
1786
1787 if (parent_out)
1788 *parent_out = last;
1789 return link;
1790}
1791
1792/*
1793 * The file is about to be polled. Make sure it's on the polled_files
1794 * RB tree. Note that files once added to the polled_files tree are
1795 * not removed before the file is released. This is because a file
1796 * polled once is likely to be polled again.
1797 */
1798static void fuse_register_polled_file(struct fuse_conn *fc,
1799 struct fuse_file *ff)
1800{
1801 spin_lock(&fc->lock);
1802 if (RB_EMPTY_NODE(&ff->polled_node)) {
1803 struct rb_node **link, *parent;
1804
1805 link = fuse_find_polled_node(fc, ff->kh, &parent);
1806 BUG_ON(*link);
1807 rb_link_node(&ff->polled_node, parent, link);
1808 rb_insert_color(&ff->polled_node, &fc->polled_files);
1809 }
1810 spin_unlock(&fc->lock);
1811}
1812
1813static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1814{
1815 struct inode *inode = file->f_dentry->d_inode;
1816 struct fuse_file *ff = file->private_data;
1817 struct fuse_conn *fc = get_fuse_conn(inode);
1818 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
1819 struct fuse_poll_out outarg;
1820 struct fuse_req *req;
1821 int err;
1822
1823 if (fc->no_poll)
1824 return DEFAULT_POLLMASK;
1825
1826 poll_wait(file, &ff->poll_wait, wait);
1827
1828 /*
1829 * Ask for notification iff there's someone waiting for it.
1830 * The client may ignore the flag and always notify.
1831 */
1832 if (waitqueue_active(&ff->poll_wait)) {
1833 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
1834 fuse_register_polled_file(fc, ff);
1835 }
1836
1837 req = fuse_get_req(fc);
1838 if (IS_ERR(req))
1839 return PTR_ERR(req);
1840
1841 req->in.h.opcode = FUSE_POLL;
1842 req->in.h.nodeid = get_node_id(inode);
1843 req->in.numargs = 1;
1844 req->in.args[0].size = sizeof(inarg);
1845 req->in.args[0].value = &inarg;
1846 req->out.numargs = 1;
1847 req->out.args[0].size = sizeof(outarg);
1848 req->out.args[0].value = &outarg;
1849 fuse_request_send(fc, req);
1850 err = req->out.h.error;
1851 fuse_put_request(fc, req);
1852
1853 if (!err)
1854 return outarg.revents;
1855 if (err == -ENOSYS) {
1856 fc->no_poll = 1;
1857 return DEFAULT_POLLMASK;
1858 }
1859 return POLLERR;
1860}
1861
1862/*
1863 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
1864 * wakes up the poll waiters.
1865 */
1866int fuse_notify_poll_wakeup(struct fuse_conn *fc,
1867 struct fuse_notify_poll_wakeup_out *outarg)
1868{
1869 u64 kh = outarg->kh;
1870 struct rb_node **link;
1871
1872 spin_lock(&fc->lock);
1873
1874 link = fuse_find_polled_node(fc, kh, NULL);
1875 if (*link) {
1876 struct fuse_file *ff;
1877
1878 ff = rb_entry(*link, struct fuse_file, polled_node);
1879 wake_up_interruptible_sync(&ff->poll_wait);
1880 }
1881
1882 spin_unlock(&fc->lock);
1883 return 0;
1884}
1885
1473static const struct file_operations fuse_file_operations = { 1886static const struct file_operations fuse_file_operations = {
1474 .llseek = fuse_file_llseek, 1887 .llseek = fuse_file_llseek,
1475 .read = do_sync_read, 1888 .read = do_sync_read,
@@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = {
1484 .lock = fuse_file_lock, 1897 .lock = fuse_file_lock,
1485 .flock = fuse_file_flock, 1898 .flock = fuse_file_flock,
1486 .splice_read = generic_file_splice_read, 1899 .splice_read = generic_file_splice_read,
1900 .unlocked_ioctl = fuse_file_ioctl,
1901 .compat_ioctl = fuse_file_compat_ioctl,
1902 .poll = fuse_file_poll,
1487}; 1903};
1488 1904
1489static const struct file_operations fuse_direct_io_file_operations = { 1905static const struct file_operations fuse_direct_io_file_operations = {
@@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = {
1496 .fsync = fuse_fsync, 1912 .fsync = fuse_fsync,
1497 .lock = fuse_file_lock, 1913 .lock = fuse_file_lock,
1498 .flock = fuse_file_flock, 1914 .flock = fuse_file_flock,
1915 .unlocked_ioctl = fuse_file_ioctl,
1916 .compat_ioctl = fuse_file_compat_ioctl,
1917 .poll = fuse_file_poll,
1499 /* no mmap and splice_read */ 1918 /* no mmap and splice_read */
1500}; 1919};
1501 1920
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 35accfdd747f..5e64b815a5a1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -19,6 +19,8 @@
19#include <linux/backing-dev.h> 19#include <linux/backing-dev.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/rbtree.h>
23#include <linux/poll.h>
22 24
23/** Max number of pages that can be used in a single read request */ 25/** Max number of pages that can be used in a single read request */
24#define FUSE_MAX_PAGES_PER_REQ 32 26#define FUSE_MAX_PAGES_PER_REQ 32
@@ -100,6 +102,9 @@ struct fuse_file {
100 /** Request reserved for flush and release */ 102 /** Request reserved for flush and release */
101 struct fuse_req *reserved_req; 103 struct fuse_req *reserved_req;
102 104
105 /** Kernel file handle guaranteed to be unique */
106 u64 kh;
107
103 /** File handle used by userspace */ 108 /** File handle used by userspace */
104 u64 fh; 109 u64 fh;
105 110
@@ -108,6 +113,12 @@ struct fuse_file {
108 113
109 /** Entry on inode's write_files list */ 114 /** Entry on inode's write_files list */
110 struct list_head write_entry; 115 struct list_head write_entry;
116
117 /** RB node to be linked on fuse_conn->polled_files */
118 struct rb_node polled_node;
119
120 /** Wait queue head for poll */
121 wait_queue_head_t poll_wait;
111}; 122};
112 123
113/** One input argument of a request */ 124/** One input argument of a request */
@@ -322,6 +333,12 @@ struct fuse_conn {
322 /** The list of requests under I/O */ 333 /** The list of requests under I/O */
323 struct list_head io; 334 struct list_head io;
324 335
336 /** The next unique kernel file handle */
337 u64 khctr;
338
339 /** rbtree of fuse_files waiting for poll events indexed by kh */
340 struct rb_root polled_files;
341
325 /** Number of requests currently in the background */ 342 /** Number of requests currently in the background */
326 unsigned num_background; 343 unsigned num_background;
327 344
@@ -355,19 +372,19 @@ struct fuse_conn {
355 /** Connection failed (version mismatch). Cannot race with 372 /** Connection failed (version mismatch). Cannot race with
356 setting other bitfields since it is only set once in INIT 373 setting other bitfields since it is only set once in INIT
357 reply, before any other request, and never cleared */ 374 reply, before any other request, and never cleared */
358 unsigned conn_error : 1; 375 unsigned conn_error:1;
359 376
360 /** Connection successful. Only set in INIT */ 377 /** Connection successful. Only set in INIT */
361 unsigned conn_init : 1; 378 unsigned conn_init:1;
362 379
363 /** Do readpages asynchronously? Only set in INIT */ 380 /** Do readpages asynchronously? Only set in INIT */
364 unsigned async_read : 1; 381 unsigned async_read:1;
365 382
366 /** Do not send separate SETATTR request before open(O_TRUNC) */ 383 /** Do not send separate SETATTR request before open(O_TRUNC) */
367 unsigned atomic_o_trunc : 1; 384 unsigned atomic_o_trunc:1;
368 385
369 /** Filesystem supports NFS exporting. Only set in INIT */ 386 /** Filesystem supports NFS exporting. Only set in INIT */
370 unsigned export_support : 1; 387 unsigned export_support:1;
371 388
372 /* 389 /*
373 * The following bitfields are only for optimization purposes 390 * The following bitfields are only for optimization purposes
@@ -375,43 +392,46 @@ struct fuse_conn {
375 */ 392 */
376 393
377 /** Is fsync not implemented by fs? */ 394 /** Is fsync not implemented by fs? */
378 unsigned no_fsync : 1; 395 unsigned no_fsync:1;
379 396
380 /** Is fsyncdir not implemented by fs? */ 397 /** Is fsyncdir not implemented by fs? */
381 unsigned no_fsyncdir : 1; 398 unsigned no_fsyncdir:1;
382 399
383 /** Is flush not implemented by fs? */ 400 /** Is flush not implemented by fs? */
384 unsigned no_flush : 1; 401 unsigned no_flush:1;
385 402
386 /** Is setxattr not implemented by fs? */ 403 /** Is setxattr not implemented by fs? */
387 unsigned no_setxattr : 1; 404 unsigned no_setxattr:1;
388 405
389 /** Is getxattr not implemented by fs? */ 406 /** Is getxattr not implemented by fs? */
390 unsigned no_getxattr : 1; 407 unsigned no_getxattr:1;
391 408
392 /** Is listxattr not implemented by fs? */ 409 /** Is listxattr not implemented by fs? */
393 unsigned no_listxattr : 1; 410 unsigned no_listxattr:1;
394 411
395 /** Is removexattr not implemented by fs? */ 412 /** Is removexattr not implemented by fs? */
396 unsigned no_removexattr : 1; 413 unsigned no_removexattr:1;
397 414
398 /** Are file locking primitives not implemented by fs? */ 415 /** Are file locking primitives not implemented by fs? */
399 unsigned no_lock : 1; 416 unsigned no_lock:1;
400 417
401 /** Is access not implemented by fs? */ 418 /** Is access not implemented by fs? */
402 unsigned no_access : 1; 419 unsigned no_access:1;
403 420
404 /** Is create not implemented by fs? */ 421 /** Is create not implemented by fs? */
405 unsigned no_create : 1; 422 unsigned no_create:1;
406 423
407 /** Is interrupt not implemented by fs? */ 424 /** Is interrupt not implemented by fs? */
408 unsigned no_interrupt : 1; 425 unsigned no_interrupt:1;
409 426
410 /** Is bmap not implemented by fs? */ 427 /** Is bmap not implemented by fs? */
411 unsigned no_bmap : 1; 428 unsigned no_bmap:1;
429
430 /** Is poll not implemented by fs? */
431 unsigned no_poll:1;
412 432
413 /** Do multi-page cached writes */ 433 /** Do multi-page cached writes */
414 unsigned big_writes : 1; 434 unsigned big_writes:1;
415 435
416 /** The number of requests waiting for completion */ 436 /** The number of requests waiting for completion */
417 atomic_t num_waiting; 437 atomic_t num_waiting;
@@ -445,6 +465,9 @@ struct fuse_conn {
445 465
446 /** Version counter for attribute changes */ 466 /** Version counter for attribute changes */
447 u64 attr_version; 467 u64 attr_version;
468
469 /** Called on final put */
470 void (*release)(struct fuse_conn *);
448}; 471};
449 472
450static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 473static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
499 */ 522 */
500int fuse_open_common(struct inode *inode, struct file *file, int isdir); 523int fuse_open_common(struct inode *inode, struct file *file, int isdir);
501 524
502struct fuse_file *fuse_file_alloc(void); 525struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
503void fuse_file_free(struct fuse_file *ff); 526void fuse_file_free(struct fuse_file *ff);
504void fuse_finish_open(struct inode *inode, struct file *file, 527void fuse_finish_open(struct inode *inode, struct file *file,
505 struct fuse_file *ff, struct fuse_open_out *outarg); 528 struct fuse_file *ff, struct fuse_open_out *outarg);
@@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
519 int isdir); 542 int isdir);
520 543
521/** 544/**
545 * Notify poll wakeup
546 */
547int fuse_notify_poll_wakeup(struct fuse_conn *fc,
548 struct fuse_notify_poll_wakeup_out *outarg);
549
550/**
522 * Initialize file operations on a regular file 551 * Initialize file operations on a regular file
523 */ 552 */
524void fuse_init_file_inode(struct inode *inode); 553void fuse_init_file_inode(struct inode *inode);
@@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
593/** 622/**
594 * Send a request (synchronous) 623 * Send a request (synchronous)
595 */ 624 */
596void request_send(struct fuse_conn *fc, struct fuse_req *req); 625void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
597 626
598/** 627/**
599 * Send a request with no reply 628 * Send a request with no reply
600 */ 629 */
601void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); 630void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
602 631
603/** 632/**
604 * Send a request in the background 633 * Send a request in the background
605 */ 634 */
606void request_send_background(struct fuse_conn *fc, struct fuse_req *req); 635void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
607 636
608void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); 637void fuse_request_send_background_locked(struct fuse_conn *fc,
638 struct fuse_req *req);
609 639
610/* Abort all requests */ 640/* Abort all requests */
611void fuse_abort_conn(struct fuse_conn *fc); 641void fuse_abort_conn(struct fuse_conn *fc);
@@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
623struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); 653struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
624 654
625/** 655/**
656 * Initialize fuse_conn
657 */
658int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
659
660/**
626 * Release reference to fuse_conn 661 * Release reference to fuse_conn
627 */ 662 */
628void fuse_conn_put(struct fuse_conn *fc); 663void fuse_conn_put(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e99f34b4435..459b73dd45e1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -37,10 +37,10 @@ struct fuse_mount_data {
37 unsigned rootmode; 37 unsigned rootmode;
38 unsigned user_id; 38 unsigned user_id;
39 unsigned group_id; 39 unsigned group_id;
40 unsigned fd_present : 1; 40 unsigned fd_present:1;
41 unsigned rootmode_present : 1; 41 unsigned rootmode_present:1;
42 unsigned user_id_present : 1; 42 unsigned user_id_present:1;
43 unsigned group_id_present : 1; 43 unsigned group_id_present:1;
44 unsigned flags; 44 unsigned flags;
45 unsigned max_read; 45 unsigned max_read;
46 unsigned blksize; 46 unsigned blksize;
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
94 req->in.numargs = 1; 94 req->in.numargs = 1;
95 req->in.args[0].size = sizeof(struct fuse_forget_in); 95 req->in.args[0].size = sizeof(struct fuse_forget_in);
96 req->in.args[0].value = inarg; 96 req->in.args[0].value = inarg;
97 request_send_noreply(fc, req); 97 fuse_request_send_noreply(fc, req);
98} 98}
99 99
100static void fuse_clear_inode(struct inode *inode) 100static void fuse_clear_inode(struct inode *inode)
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
250 250
251 fi = get_fuse_inode(inode); 251 fi = get_fuse_inode(inode);
252 spin_lock(&fc->lock); 252 spin_lock(&fc->lock);
253 fi->nlookup ++; 253 fi->nlookup++;
254 spin_unlock(&fc->lock); 254 spin_unlock(&fc->lock);
255 fuse_change_attributes(inode, attr, attr_valid, attr_version); 255 fuse_change_attributes(inode, attr, attr_valid, attr_version);
256 256
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
269 fc->destroy_req = NULL; 269 fc->destroy_req = NULL;
270 req->in.h.opcode = FUSE_DESTROY; 270 req->in.h.opcode = FUSE_DESTROY;
271 req->force = 1; 271 req->force = 1;
272 request_send(fc, req); 272 fuse_request_send(fc, req);
273 fuse_put_request(fc, req); 273 fuse_put_request(fc, req);
274 } 274 }
275} 275}
@@ -292,6 +292,7 @@ static void fuse_put_super(struct super_block *sb)
292 list_del(&fc->entry); 292 list_del(&fc->entry);
293 fuse_ctl_remove_conn(fc); 293 fuse_ctl_remove_conn(fc);
294 mutex_unlock(&fuse_mutex); 294 mutex_unlock(&fuse_mutex);
295 bdi_destroy(&fc->bdi);
295 fuse_conn_put(fc); 296 fuse_conn_put(fc);
296} 297}
297 298
@@ -334,7 +335,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
334 req->out.args[0].size = 335 req->out.args[0].size =
335 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); 336 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
336 req->out.args[0].value = &outarg; 337 req->out.args[0].value = &outarg;
337 request_send(fc, req); 338 fuse_request_send(fc, req);
338 err = req->out.h.error; 339 err = req->out.h.error;
339 if (!err) 340 if (!err)
340 convert_fuse_statfs(buf, &outarg.st); 341 convert_fuse_statfs(buf, &outarg.st);
@@ -462,68 +463,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
462 return 0; 463 return 0;
463} 464}
464 465
465static struct fuse_conn *new_conn(struct super_block *sb) 466int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
466{ 467{
467 struct fuse_conn *fc;
468 int err; 468 int err;
469 469
470 fc = kzalloc(sizeof(*fc), GFP_KERNEL); 470 memset(fc, 0, sizeof(*fc));
471 if (fc) { 471 spin_lock_init(&fc->lock);
472 spin_lock_init(&fc->lock); 472 mutex_init(&fc->inst_mutex);
473 mutex_init(&fc->inst_mutex); 473 atomic_set(&fc->count, 1);
474 atomic_set(&fc->count, 1); 474 init_waitqueue_head(&fc->waitq);
475 init_waitqueue_head(&fc->waitq); 475 init_waitqueue_head(&fc->blocked_waitq);
476 init_waitqueue_head(&fc->blocked_waitq); 476 init_waitqueue_head(&fc->reserved_req_waitq);
477 init_waitqueue_head(&fc->reserved_req_waitq); 477 INIT_LIST_HEAD(&fc->pending);
478 INIT_LIST_HEAD(&fc->pending); 478 INIT_LIST_HEAD(&fc->processing);
479 INIT_LIST_HEAD(&fc->processing); 479 INIT_LIST_HEAD(&fc->io);
480 INIT_LIST_HEAD(&fc->io); 480 INIT_LIST_HEAD(&fc->interrupts);
481 INIT_LIST_HEAD(&fc->interrupts); 481 INIT_LIST_HEAD(&fc->bg_queue);
482 INIT_LIST_HEAD(&fc->bg_queue); 482 INIT_LIST_HEAD(&fc->entry);
483 atomic_set(&fc->num_waiting, 0); 483 atomic_set(&fc->num_waiting, 0);
484 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 484 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
485 fc->bdi.unplug_io_fn = default_unplug_io_fn; 485 fc->bdi.unplug_io_fn = default_unplug_io_fn;
486 /* fuse does it's own writeback accounting */ 486 /* fuse does it's own writeback accounting */
487 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 487 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
488 fc->dev = sb->s_dev; 488 fc->khctr = 0;
489 err = bdi_init(&fc->bdi); 489 fc->polled_files = RB_ROOT;
490 if (err) 490 fc->dev = sb->s_dev;
491 goto error_kfree; 491 err = bdi_init(&fc->bdi);
492 if (sb->s_bdev) { 492 if (err)
493 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", 493 goto error_mutex_destroy;
494 MAJOR(fc->dev), MINOR(fc->dev)); 494 if (sb->s_bdev) {
495 } else { 495 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
496 err = bdi_register_dev(&fc->bdi, fc->dev); 496 MAJOR(fc->dev), MINOR(fc->dev));
497 } 497 } else {
498 if (err) 498 err = bdi_register_dev(&fc->bdi, fc->dev);
499 goto error_bdi_destroy;
500 /*
501 * For a single fuse filesystem use max 1% of dirty +
502 * writeback threshold.
503 *
504 * This gives about 1M of write buffer for memory maps on a
505 * machine with 1G and 10% dirty_ratio, which should be more
506 * than enough.
507 *
508 * Privileged users can raise it by writing to
509 *
510 * /sys/class/bdi/<bdi>/max_ratio
511 */
512 bdi_set_max_ratio(&fc->bdi, 1);
513 fc->reqctr = 0;
514 fc->blocked = 1;
515 fc->attr_version = 1;
516 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
517 } 499 }
518 return fc; 500 if (err)
501 goto error_bdi_destroy;
502 /*
503 * For a single fuse filesystem use max 1% of dirty +
504 * writeback threshold.
505 *
506 * This gives about 1M of write buffer for memory maps on a
507 * machine with 1G and 10% dirty_ratio, which should be more
508 * than enough.
509 *
510 * Privileged users can raise it by writing to
511 *
512 * /sys/class/bdi/<bdi>/max_ratio
513 */
514 bdi_set_max_ratio(&fc->bdi, 1);
515 fc->reqctr = 0;
516 fc->blocked = 1;
517 fc->attr_version = 1;
518 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
519 519
520error_bdi_destroy: 520 return 0;
521
522 error_bdi_destroy:
521 bdi_destroy(&fc->bdi); 523 bdi_destroy(&fc->bdi);
522error_kfree: 524 error_mutex_destroy:
523 mutex_destroy(&fc->inst_mutex); 525 mutex_destroy(&fc->inst_mutex);
524 kfree(fc); 526 return err;
525 return NULL;
526} 527}
528EXPORT_SYMBOL_GPL(fuse_conn_init);
527 529
528void fuse_conn_put(struct fuse_conn *fc) 530void fuse_conn_put(struct fuse_conn *fc)
529{ 531{
@@ -531,8 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc)
531 if (fc->destroy_req) 533 if (fc->destroy_req)
532 fuse_request_free(fc->destroy_req); 534 fuse_request_free(fc->destroy_req);
533 mutex_destroy(&fc->inst_mutex); 535 mutex_destroy(&fc->inst_mutex);
534 bdi_destroy(&fc->bdi); 536 fc->release(fc);
535 kfree(fc);
536 } 537 }
537} 538}
538 539
@@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
542 return fc; 543 return fc;
543} 544}
544 545
545static struct inode *get_root_inode(struct super_block *sb, unsigned mode) 546static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
546{ 547{
547 struct fuse_attr attr; 548 struct fuse_attr attr;
548 memset(&attr, 0, sizeof(attr)); 549 memset(&attr, 0, sizeof(attr));
@@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
553 return fuse_iget(sb, 1, 0, &attr, 0, 0); 554 return fuse_iget(sb, 1, 0, &attr, 0, 0);
554} 555}
555 556
556struct fuse_inode_handle 557struct fuse_inode_handle {
557{
558 u64 nodeid; 558 u64 nodeid;
559 u32 generation; 559 u32 generation;
560}; 560};
@@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
761 fc->max_write = max_t(unsigned, 4096, fc->max_write); 761 fc->max_write = max_t(unsigned, 4096, fc->max_write);
762 fc->conn_init = 1; 762 fc->conn_init = 1;
763 } 763 }
764 fuse_put_request(fc, req);
765 fc->blocked = 0; 764 fc->blocked = 0;
766 wake_up_all(&fc->blocked_waitq); 765 wake_up_all(&fc->blocked_waitq);
767} 766}
@@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
787 req->out.args[0].size = sizeof(struct fuse_init_out); 786 req->out.args[0].size = sizeof(struct fuse_init_out);
788 req->out.args[0].value = &req->misc.init_out; 787 req->out.args[0].value = &req->misc.init_out;
789 req->end = process_init_reply; 788 req->end = process_init_reply;
790 request_send_background(fc, req); 789 fuse_request_send_background(fc, req);
790}
791
792static void fuse_free_conn(struct fuse_conn *fc)
793{
794 kfree(fc);
791} 795}
792 796
793static int fuse_fill_super(struct super_block *sb, void *data, int silent) 797static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -801,16 +805,18 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
801 int err; 805 int err;
802 int is_bdev = sb->s_bdev != NULL; 806 int is_bdev = sb->s_bdev != NULL;
803 807
808 err = -EINVAL;
804 if (sb->s_flags & MS_MANDLOCK) 809 if (sb->s_flags & MS_MANDLOCK)
805 return -EINVAL; 810 goto err;
806 811
807 if (!parse_fuse_opt((char *) data, &d, is_bdev)) 812 if (!parse_fuse_opt((char *) data, &d, is_bdev))
808 return -EINVAL; 813 goto err;
809 814
810 if (is_bdev) { 815 if (is_bdev) {
811#ifdef CONFIG_BLOCK 816#ifdef CONFIG_BLOCK
817 err = -EINVAL;
812 if (!sb_set_blocksize(sb, d.blksize)) 818 if (!sb_set_blocksize(sb, d.blksize))
813 return -EINVAL; 819 goto err;
814#endif 820#endif
815 } else { 821 } else {
816 sb->s_blocksize = PAGE_CACHE_SIZE; 822 sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -822,16 +828,25 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
822 sb->s_export_op = &fuse_export_operations; 828 sb->s_export_op = &fuse_export_operations;
823 829
824 file = fget(d.fd); 830 file = fget(d.fd);
831 err = -EINVAL;
825 if (!file) 832 if (!file)
826 return -EINVAL; 833 goto err;
827 834
828 if (file->f_op != &fuse_dev_operations) 835 if (file->f_op != &fuse_dev_operations)
829 return -EINVAL; 836 goto err_fput;
830 837
831 fc = new_conn(sb); 838 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
839 err = -ENOMEM;
832 if (!fc) 840 if (!fc)
833 return -ENOMEM; 841 goto err_fput;
834 842
843 err = fuse_conn_init(fc, sb);
844 if (err) {
845 kfree(fc);
846 goto err_fput;
847 }
848
849 fc->release = fuse_free_conn;
835 fc->flags = d.flags; 850 fc->flags = d.flags;
836 fc->user_id = d.user_id; 851 fc->user_id = d.user_id;
837 fc->group_id = d.group_id; 852 fc->group_id = d.group_id;
@@ -841,14 +856,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
841 sb->s_fs_info = fc; 856 sb->s_fs_info = fc;
842 857
843 err = -ENOMEM; 858 err = -ENOMEM;
844 root = get_root_inode(sb, d.rootmode); 859 root = fuse_get_root_inode(sb, d.rootmode);
845 if (!root) 860 if (!root)
846 goto err; 861 goto err_put_conn;
847 862
848 root_dentry = d_alloc_root(root); 863 root_dentry = d_alloc_root(root);
849 if (!root_dentry) { 864 if (!root_dentry) {
850 iput(root); 865 iput(root);
851 goto err; 866 goto err_put_conn;
852 } 867 }
853 868
854 init_req = fuse_request_alloc(); 869 init_req = fuse_request_alloc();
@@ -892,9 +907,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
892 fuse_request_free(init_req); 907 fuse_request_free(init_req);
893 err_put_root: 908 err_put_root:
894 dput(root_dentry); 909 dput(root_dentry);
895 err: 910 err_put_conn:
896 fput(file);
897 fuse_conn_put(fc); 911 fuse_conn_put(fc);
912 err_fput:
913 fput(file);
914 err:
898 return err; 915 return err;
899} 916}
900 917
@@ -952,7 +969,7 @@ static inline void unregister_fuseblk(void)
952 969
953static void fuse_inode_init_once(void *foo) 970static void fuse_inode_init_once(void *foo)
954{ 971{
955 struct inode * inode = foo; 972 struct inode *inode = foo;
956 973
957 inode_init_once(inode); 974 inode_init_once(inode);
958} 975}
@@ -1031,7 +1048,7 @@ static int __init fuse_init(void)
1031{ 1048{
1032 int res; 1049 int res;
1033 1050
1034 printk("fuse init (API version %i.%i)\n", 1051 printk(KERN_INFO "fuse init (API version %i.%i)\n",
1035 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); 1052 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
1036 1053
1037 INIT_LIST_HEAD(&fuse_conn_list); 1054 INIT_LIST_HEAD(&fuse_conn_list);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index ab2f57e3fb87..e563a6449811 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) 3 depends on EXPERIMENTAL && (64BIT || LBD)
4 select FS_POSIX_ACL 4 select FS_POSIX_ACL
5 select CRC32 5 select CRC32
6 help 6 help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index ec65851ec80a..c1b4ec6a9650 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,5 +1,5 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o 1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ 2gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ 3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3e9bd46f27e3..e335dceb6a4f 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
91 struct gfs2_ea_location el_this; 91 struct gfs2_ea_location el_this;
92 int error; 92 int error;
93 93
94 if (!ip->i_di.di_eattr) 94 if (!ip->i_eattr)
95 return 0; 95 return 0;
96 96
97 memset(&er, 0, sizeof(struct gfs2_ea_request)); 97 memset(&er, 0, sizeof(struct gfs2_ea_request));
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index bec76b1c2bb0..11ffc56f1f81 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
75 void *kaddr = kmap(page); 75 void *kaddr = kmap(page);
76 76
77 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 77 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
78 ip->i_di.di_size); 78 ip->i_disksize);
79 memset(kaddr + ip->i_di.di_size, 0, 79 memset(kaddr + ip->i_disksize, 0,
80 PAGE_CACHE_SIZE - ip->i_di.di_size); 80 PAGE_CACHE_SIZE - ip->i_disksize);
81 kunmap(page); 81 kunmap(page);
82 82
83 SetPageUptodate(page); 83 SetPageUptodate(page);
@@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 132 if (error)
133 goto out; 133 goto out;
134 134
135 if (ip->i_di.di_size) { 135 if (ip->i_disksize) {
136 /* Get a free block, fill it with the stuffed data, 136 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 137 and write it out to disk */
138 138
@@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
159 di = (struct gfs2_dinode *)dibh->b_data; 159 di = (struct gfs2_dinode *)dibh->b_data;
160 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 160 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
161 161
162 if (ip->i_di.di_size) { 162 if (ip->i_disksize) {
163 *(__be64 *)(di + 1) = cpu_to_be64(block); 163 *(__be64 *)(di + 1) = cpu_to_be64(block);
164 gfs2_add_inode_blocks(&ip->i_inode, 1); 164 gfs2_add_inode_blocks(&ip->i_inode, 1);
165 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 165 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
926 } 926 }
927 } 927 }
928 928
929 ip->i_di.di_size = size; 929 ip->i_disksize = size;
930 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 930 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
931 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 931 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
932 gfs2_dinode_out(ip, dibh->b_data); 932 gfs2_dinode_out(ip, dibh->b_data);
@@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1033 goto out; 1033 goto out;
1034 1034
1035 if (gfs2_is_stuffed(ip)) { 1035 if (gfs2_is_stuffed(ip)) {
1036 ip->i_di.di_size = size; 1036 ip->i_disksize = size;
1037 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1037 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1038 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1038 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1039 gfs2_dinode_out(ip, dibh->b_data); 1039 gfs2_dinode_out(ip, dibh->b_data);
@@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1045 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1045 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
1046 1046
1047 if (!error) { 1047 if (!error) {
1048 ip->i_di.di_size = size; 1048 ip->i_disksize = size;
1049 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1049 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1050 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; 1050 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1051 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1051 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1052 gfs2_dinode_out(ip, dibh->b_data); 1052 gfs2_dinode_out(ip, dibh->b_data);
1053 } 1053 }
@@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip)
1114 if (error) 1114 if (error)
1115 goto out; 1115 goto out;
1116 1116
1117 if (!ip->i_di.di_size) { 1117 if (!ip->i_disksize) {
1118 ip->i_height = 0; 1118 ip->i_height = 0;
1119 ip->i_goal = ip->i_no_addr; 1119 ip->i_goal = ip->i_no_addr;
1120 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1120 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1121 } 1121 }
1122 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1122 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1123 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; 1123 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1124 1124
1125 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1125 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1126 gfs2_dinode_out(ip, dibh->b_data); 1126 gfs2_dinode_out(ip, dibh->b_data);
@@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1205 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1205 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
1206 return -EINVAL; 1206 return -EINVAL;
1207 1207
1208 if (size > ip->i_di.di_size) 1208 if (size > ip->i_disksize)
1209 error = do_grow(ip, size); 1209 error = do_grow(ip, size);
1210 else if (size < ip->i_di.di_size) 1210 else if (size < ip->i_disksize)
1211 error = do_shrink(ip, size); 1211 error = do_shrink(ip, size);
1212 else 1212 else
1213 /* update time stamps */ 1213 /* update time stamps */
@@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1219int gfs2_truncatei_resume(struct gfs2_inode *ip) 1219int gfs2_truncatei_resume(struct gfs2_inode *ip)
1220{ 1220{
1221 int error; 1221 int error;
1222 error = trunc_dealloc(ip, ip->i_di.di_size); 1222 error = trunc_dealloc(ip, ip->i_disksize);
1223 if (!error) 1223 if (!error)
1224 error = trunc_end(ip); 1224 error = trunc_end(ip);
1225 return error; 1225 return error;
@@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
1231} 1231}
1232 1232
1233/** 1233/**
1234 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1235 * @ip: the file
1236 * @len: the number of bytes to be written to the file
1237 * @data_blocks: returns the number of data blocks required
1238 * @ind_blocks: returns the number of indirect blocks required
1239 *
1240 */
1241
1242void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1243 unsigned int *data_blocks, unsigned int *ind_blocks)
1244{
1245 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1246 unsigned int tmp;
1247
1248 if (gfs2_is_dir(ip)) {
1249 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1250 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1251 } else {
1252 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1253 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1254 }
1255
1256 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1257 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1258 *ind_blocks += tmp;
1259 }
1260}
1261
1262/**
1263 * gfs2_write_alloc_required - figure out if a write will require an allocation 1234 * gfs2_write_alloc_required - figure out if a write will require an allocation
1264 * @ip: the file being written to 1235 * @ip: the file being written to
1265 * @offset: the offset to write to 1236 * @offset: the offset to write to
@@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1276 struct buffer_head bh; 1247 struct buffer_head bh;
1277 unsigned int shift; 1248 unsigned int shift;
1278 u64 lblock, lblock_stop, size; 1249 u64 lblock, lblock_stop, size;
1250 u64 end_of_file;
1279 1251
1280 *alloc_required = 0; 1252 *alloc_required = 0;
1281 1253
@@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1291 1263
1292 *alloc_required = 1; 1264 *alloc_required = 1;
1293 shift = sdp->sd_sb.sb_bsize_shift; 1265 shift = sdp->sd_sb.sb_bsize_shift;
1294 if (gfs2_is_dir(ip)) { 1266 BUG_ON(gfs2_is_dir(ip));
1295 unsigned int bsize = sdp->sd_jbsize; 1267 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
1296 lblock = offset; 1268 lblock = offset >> shift;
1297 do_div(lblock, bsize); 1269 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1298 lblock_stop = offset + len + bsize - 1; 1270 if (lblock_stop > end_of_file)
1299 do_div(lblock_stop, bsize); 1271 return 0;
1300 } else {
1301 u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
1302 lblock = offset >> shift;
1303 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1304 if (lblock_stop > end_of_file)
1305 return 0;
1306 }
1307 1272
1308 size = (lblock_stop - lblock) << shift; 1273 size = (lblock_stop - lblock) << shift;
1309 do { 1274 do {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e6cde2943bd..c983177e05ac 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -10,10 +10,40 @@
10#ifndef __BMAP_DOT_H__ 10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__ 11#define __BMAP_DOT_H__
12 12
13#include "inode.h"
14
13struct inode; 15struct inode;
14struct gfs2_inode; 16struct gfs2_inode;
15struct page; 17struct page;
16 18
19
20/**
21 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
22 * @ip: the file
23 * @len: the number of bytes to be written to the file
24 * @data_blocks: returns the number of data blocks required
25 * @ind_blocks: returns the number of indirect blocks required
26 *
27 */
28
29static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
30 unsigned int len,
31 unsigned int *data_blocks,
32 unsigned int *ind_blocks)
33{
34 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
35 unsigned int tmp;
36
37 BUG_ON(gfs2_is_dir(ip));
38 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
39 *ind_blocks = 3 * (sdp->sd_max_height - 1);
40
41 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
42 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
43 *ind_blocks += tmp;
44 }
45}
46
17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
18int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
@@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
21int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
22int gfs2_truncatei_resume(struct gfs2_inode *ip); 52int gfs2_truncatei_resume(struct gfs2_inode *ip);
23int gfs2_file_dealloc(struct gfs2_inode *ip); 53int gfs2_file_dealloc(struct gfs2_inode *ip);
24
25void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
26 unsigned int *data_blocks,
27 unsigned int *ind_blocks);
28int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
29 unsigned int len, int *alloc_required); 55 unsigned int len, int *alloc_required);
30 56
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
deleted file mode 100644
index e51991947d2c..000000000000
--- a/fs/gfs2/daemon.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <linux/freezer.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "daemon.h"
24#include "glock.h"
25#include "log.h"
26#include "quota.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30
31/* This uses schedule_timeout() instead of msleep() because it's good for
32 the daemons to wake up more often than the timeout when unmounting so
33 the user's unmount doesn't sit there forever.
34
35 The kthread functions used to start these daemons block and flush signals. */
36
37/**
38 * gfs2_glockd - Reclaim unused glock structures
39 * @sdp: Pointer to GFS2 superblock
40 *
41 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
42 * Number of daemons can be set by user, with num_glockd mount option.
43 */
44
45int gfs2_glockd(void *data)
46{
47 struct gfs2_sbd *sdp = data;
48
49 while (!kthread_should_stop()) {
50 while (atomic_read(&sdp->sd_reclaim_count))
51 gfs2_reclaim_glock(sdp);
52
53 wait_event_interruptible(sdp->sd_reclaim_wq,
54 (atomic_read(&sdp->sd_reclaim_count) ||
55 kthread_should_stop()));
56 if (freezing(current))
57 refrigerator();
58 }
59
60 return 0;
61}
62
63/**
64 * gfs2_recoverd - Recover dead machine's journals
65 * @sdp: Pointer to GFS2 superblock
66 *
67 */
68
69int gfs2_recoverd(void *data)
70{
71 struct gfs2_sbd *sdp = data;
72 unsigned long t;
73
74 while (!kthread_should_stop()) {
75 gfs2_check_journals(sdp);
76 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
77 if (freezing(current))
78 refrigerator();
79 schedule_timeout_interruptible(t);
80 }
81
82 return 0;
83}
84
85/**
86 * gfs2_quotad - Write cached quota changes into the quota file
87 * @sdp: Pointer to GFS2 superblock
88 *
89 */
90
91int gfs2_quotad(void *data)
92{
93 struct gfs2_sbd *sdp = data;
94 unsigned long t;
95 int error;
96
97 while (!kthread_should_stop()) {
98 /* Update the master statfs file */
99
100 t = sdp->sd_statfs_sync_time +
101 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
102
103 if (time_after_eq(jiffies, t)) {
104 error = gfs2_statfs_sync(sdp);
105 if (error &&
106 error != -EROFS &&
107 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
108 fs_err(sdp, "quotad: (1) error=%d\n", error);
109 sdp->sd_statfs_sync_time = jiffies;
110 }
111
112 /* Update quota file */
113
114 t = sdp->sd_quota_sync_time +
115 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
116
117 if (time_after_eq(jiffies, t)) {
118 error = gfs2_quota_sync(sdp);
119 if (error &&
120 error != -EROFS &&
121 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
122 fs_err(sdp, "quotad: (2) error=%d\n", error);
123 sdp->sd_quota_sync_time = jiffies;
124 }
125
126 gfs2_quota_scan(sdp);
127
128 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
129 if (freezing(current))
130 refrigerator();
131 schedule_timeout_interruptible(t);
132 }
133
134 return 0;
135}
136
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
deleted file mode 100644
index 4be084fb6a62..000000000000
--- a/fs/gfs2/daemon.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_glockd(void *data);
14int gfs2_recoverd(void *data);
15int gfs2_quotad(void *data);
16
17#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index eed040d8ba3a..b7c8e5c70791 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -36,7 +36,7 @@
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the 36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when 37 * beginning of the leaf block. The dirents reside in leaves when
38 * 38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true 39 * dip->i_diskflags & GFS2_DIF_EXHASH is true
40 * 40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block. 41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 * 42 *
@@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
128 128
129 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 129 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
131 if (ip->i_di.di_size < offset + size) 131 if (ip->i_disksize < offset + size)
132 ip->i_di.di_size = offset + size; 132 ip->i_disksize = offset + size;
133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
134 gfs2_dinode_out(ip, dibh->b_data); 134 gfs2_dinode_out(ip, dibh->b_data);
135 135
@@ -226,8 +226,8 @@ out:
226 if (error) 226 if (error)
227 return error; 227 return error;
228 228
229 if (ip->i_di.di_size < offset + copied) 229 if (ip->i_disksize < offset + copied)
230 ip->i_di.di_size = offset + copied; 230 ip->i_disksize = offset + copied;
231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
232 232
233 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 233 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
277 int copied = 0; 277 int copied = 0;
278 int error = 0; 278 int error = 0;
279 279
280 if (offset >= ip->i_di.di_size) 280 if (offset >= ip->i_disksize)
281 return 0; 281 return 0;
282 282
283 if (offset + size > ip->i_di.di_size) 283 if (offset + size > ip->i_disksize)
284 size = ip->i_di.di_size - offset; 284 size = ip->i_disksize - offset;
285 285
286 if (!size) 286 if (!size)
287 return 0; 287 return 0;
@@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
755 struct gfs2_inode *ip = GFS2_I(inode); 755 struct gfs2_inode *ip = GFS2_I(inode);
756 int error; 756 int error;
757 757
758 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 758 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
759 struct gfs2_leaf *leaf; 759 struct gfs2_leaf *leaf;
760 unsigned hsize = 1 << ip->i_depth; 760 unsigned hsize = 1 << ip->i_depth;
761 unsigned index; 761 unsigned index;
762 u64 ln; 762 u64 ln;
763 if (hsize * sizeof(u64) != ip->i_di.di_size) { 763 if (hsize * sizeof(u64) != ip->i_disksize) {
764 gfs2_consist_inode(ip); 764 gfs2_consist_inode(ip);
765 return ERR_PTR(-EIO); 765 return ERR_PTR(-EIO);
766 } 766 }
@@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode)
858 return -ENOSPC; 858 return -ENOSPC;
859 bn = bh->b_blocknr; 859 bn = bh->b_blocknr;
860 860
861 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); 861 gfs2_assert(sdp, dip->i_entries < (1 << 16));
862 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); 862 leaf->lf_entries = cpu_to_be16(dip->i_entries);
863 863
864 /* Copy dirents */ 864 /* Copy dirents */
865 865
@@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode)
905 for (x = sdp->sd_hash_ptrs; x--; lp++) 905 for (x = sdp->sd_hash_ptrs; x--; lp++)
906 *lp = cpu_to_be64(bn); 906 *lp = cpu_to_be64(bn);
907 907
908 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; 908 dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
909 gfs2_add_inode_blocks(&dip->i_inode, 1); 909 gfs2_add_inode_blocks(&dip->i_inode, 1);
910 dip->i_di.di_flags |= GFS2_DIF_EXHASH; 910 dip->i_diskflags |= GFS2_DIF_EXHASH;
911 911
912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; 912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
913 dip->i_depth = y; 913 dip->i_depth = y;
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1082 int error = 0; 1082 int error = 0;
1083 1083
1084 hsize = 1 << dip->i_depth; 1084 hsize = 1 << dip->i_depth;
1085 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1085 if (hsize * sizeof(u64) != dip->i_disksize) {
1086 gfs2_consist_inode(dip); 1086 gfs2_consist_inode(dip);
1087 return -EIO; 1087 return -EIO;
1088 } 1088 }
@@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1091 1091
1092 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); 1092 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
1093 1093
1094 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { 1094 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
1095 error = gfs2_dir_read_data(dip, (char *)buf, 1095 error = gfs2_dir_read_data(dip, (char *)buf,
1096 block * sdp->sd_hash_bsize, 1096 block * sdp->sd_hash_bsize,
1097 sdp->sd_hash_bsize, 1); 1097 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1370 unsigned depth = 0;
1371 1371
1372 hsize = 1 << dip->i_depth; 1372 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1373 if (hsize * sizeof(u64) != dip->i_disksize) {
1374 gfs2_consist_inode(dip); 1374 gfs2_consist_inode(dip);
1375 return -EIO; 1375 return -EIO;
1376 } 1376 }
@@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1426 int copied = 0; 1426 int copied = 0;
1427 int error; 1427 int error;
1428 1428
1429 if (!dip->i_di.di_entries) 1429 if (!dip->i_entries)
1430 return 0; 1430 return 0;
1431 1431
1432 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) 1432 if (dip->i_diskflags & GFS2_DIF_EXHASH)
1433 return dir_e_read(inode, offset, opaque, filldir); 1433 return dir_e_read(inode, offset, opaque, filldir);
1434 1434
1435 if (!gfs2_is_stuffed(dip)) { 1435 if (!gfs2_is_stuffed(dip)) {
@@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1453 error = PTR_ERR(dent); 1453 error = PTR_ERR(dent);
1454 goto out; 1454 goto out;
1455 } 1455 }
1456 if (dip->i_di.di_entries != g.offset) { 1456 if (dip->i_entries != g.offset) {
1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, " 1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, "
1458 "ip->i_di.di_entries (%u) != g.offset (%u)\n", 1458 "ip->i_entries (%u) != g.offset (%u)\n",
1459 (unsigned long long)dip->i_no_addr, 1459 (unsigned long long)dip->i_no_addr,
1460 dip->i_di.di_entries, 1460 dip->i_entries,
1461 g.offset); 1461 g.offset);
1462 error = -EIO; 1462 error = -EIO;
1463 goto out; 1463 goto out;
1464 } 1464 }
1465 error = do_filldir_main(dip, offset, opaque, filldir, darr, 1465 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1466 dip->i_di.di_entries, &copied); 1466 dip->i_entries, &copied);
1467out: 1467out:
1468 kfree(darr); 1468 kfree(darr);
1469 } 1469 }
@@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1612 dent = gfs2_init_dirent(inode, dent, name, bh); 1612 dent = gfs2_init_dirent(inode, dent, name, bh);
1613 gfs2_inum_out(nip, dent); 1613 gfs2_inum_out(nip, dent);
1614 dent->de_type = cpu_to_be16(type); 1614 dent->de_type = cpu_to_be16(type);
1615 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 1615 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1616 leaf = (struct gfs2_leaf *)bh->b_data; 1616 leaf = (struct gfs2_leaf *)bh->b_data;
1617 be16_add_cpu(&leaf->lf_entries, 1); 1617 be16_add_cpu(&leaf->lf_entries, 1);
1618 } 1618 }
@@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1621 if (error) 1621 if (error)
1622 break; 1622 break;
1623 gfs2_trans_add_bh(ip->i_gl, bh, 1); 1623 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1624 ip->i_di.di_entries++; 1624 ip->i_entries++;
1625 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1625 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1626 gfs2_dinode_out(ip, bh->b_data); 1626 gfs2_dinode_out(ip, bh->b_data);
1627 brelse(bh); 1627 brelse(bh);
1628 error = 0; 1628 error = 0;
1629 break; 1629 break;
1630 } 1630 }
1631 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { 1631 if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
1632 error = dir_make_exhash(inode); 1632 error = dir_make_exhash(inode);
1633 if (error) 1633 if (error)
1634 break; 1634 break;
@@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1691 } 1691 }
1692 1692
1693 dirent_del(dip, bh, prev, dent); 1693 dirent_del(dip, bh, prev, dent);
1694 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { 1694 if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1695 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; 1695 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1696 u16 entries = be16_to_cpu(leaf->lf_entries); 1696 u16 entries = be16_to_cpu(leaf->lf_entries);
1697 if (!entries) 1697 if (!entries)
@@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1704 if (error) 1704 if (error)
1705 return error; 1705 return error;
1706 1706
1707 if (!dip->i_di.di_entries) 1707 if (!dip->i_entries)
1708 gfs2_consist_inode(dip); 1708 gfs2_consist_inode(dip);
1709 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1709 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1710 dip->i_di.di_entries--; 1710 dip->i_entries--;
1711 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1711 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1712 gfs2_dinode_out(dip, bh->b_data); 1712 gfs2_dinode_out(dip, bh->b_data);
1713 brelse(bh); 1713 brelse(bh);
@@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1748 gfs2_inum_out(nip, dent); 1748 gfs2_inum_out(nip, dent);
1749 dent->de_type = cpu_to_be16(new_type); 1749 dent->de_type = cpu_to_be16(new_type);
1750 1750
1751 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { 1751 if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1752 brelse(bh); 1752 brelse(bh);
1753 error = gfs2_meta_inode_buffer(dip, &bh); 1753 error = gfs2_meta_inode_buffer(dip, &bh);
1754 if (error) 1754 if (error)
@@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1784 int error = 0;
1785 1785
1786 hsize = 1 << dip->i_depth; 1786 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1787 if (hsize * sizeof(u64) != dip->i_disksize) {
1788 gfs2_consist_inode(dip); 1788 gfs2_consist_inode(dip);
1789 return -EIO; 1789 return -EIO;
1790 } 1790 }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 8a468cac9328..4f919440c3be 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -11,6 +11,7 @@
11#define __DIR_DOT_H__ 11#define __DIR_DOT_H__
12 12
13#include <linux/dcache.h> 13#include <linux/dcache.h>
14#include <linux/crc32.h>
14 15
15struct inode; 16struct inode;
16struct gfs2_inode; 17struct gfs2_inode;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index e3f76f451b0a..0d1c76d906ae 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
114 __be64 *eablk, *end; 114 __be64 *eablk, *end;
115 int error; 115 int error;
116 116
117 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh); 117 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
118 if (error) 118 if (error)
119 return error; 119 return error;
120 120
121 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { 121 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
122 error = ea_foreach_i(ip, bh, ea_call, data); 122 error = ea_foreach_i(ip, bh, ea_call, data);
123 goto out; 123 goto out;
124 } 124 }
@@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
414 if (error) 414 if (error)
415 return error; 415 return error;
416 416
417 if (ip->i_di.di_eattr) { 417 if (ip->i_eattr) {
418 struct ea_list ei = { .ei_er = er, .ei_size = 0 }; 418 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
419 419
420 error = ea_foreach(ip, ea_list_i, &ei); 420 error = ea_foreach(ip, ea_list_i, &ei);
@@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
514 struct gfs2_ea_location el; 514 struct gfs2_ea_location el;
515 int error; 515 int error;
516 516
517 if (!ip->i_di.di_eattr) 517 if (!ip->i_eattr)
518 return -ENODATA; 518 return -ENODATA;
519 519
520 error = gfs2_ea_find(ip, er, &el); 520 error = gfs2_ea_find(ip, er, &el);
@@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
741 if (error) 741 if (error)
742 return error; 742 return error;
743 743
744 ip->i_di.di_eattr = bh->b_blocknr; 744 ip->i_eattr = bh->b_blocknr;
745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); 745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
746 746
747 brelse(bh); 747 brelse(bh);
@@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
935 int error; 935 int error;
936 int mh_size = sizeof(struct gfs2_meta_header); 936 int mh_size = sizeof(struct gfs2_meta_header);
937 937
938 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { 938 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
939 __be64 *end; 939 __be64 *end;
940 940
941 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, 941 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
942 &indbh); 942 &indbh);
943 if (error) 943 if (error)
944 return error; 944 return error;
@@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
972 gfs2_buffer_clear_tail(indbh, mh_size); 972 gfs2_buffer_clear_tail(indbh, mh_size);
973 973
974 eablk = (__be64 *)(indbh->b_data + mh_size); 974 eablk = (__be64 *)(indbh->b_data + mh_size);
975 *eablk = cpu_to_be64(ip->i_di.di_eattr); 975 *eablk = cpu_to_be64(ip->i_eattr);
976 ip->i_di.di_eattr = blk; 976 ip->i_eattr = blk;
977 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; 977 ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
978 gfs2_add_inode_blocks(&ip->i_inode, 1); 978 gfs2_add_inode_blocks(&ip->i_inode, 1);
979 979
980 eablk++; 980 eablk++;
@@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1015 if (error) 1015 if (error)
1016 return error; 1016 return error;
1017 1017
1018 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) 1018 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
1019 blks++; 1019 blks++;
1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) 1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); 1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
@@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1040 struct gfs2_ea_location el; 1040 struct gfs2_ea_location el;
1041 int error; 1041 int error;
1042 1042
1043 if (!ip->i_di.di_eattr) { 1043 if (!ip->i_eattr) {
1044 if (er->er_flags & XATTR_REPLACE) 1044 if (er->er_flags & XATTR_REPLACE)
1045 return -ENODATA; 1045 return -ENODATA;
1046 return ea_init(ip, er); 1046 return ea_init(ip, er);
@@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1051 return error; 1051 return error;
1052 1052
1053 if (el.el_ea) { 1053 if (el.el_ea) {
1054 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { 1054 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1055 brelse(el.el_bh); 1055 brelse(el.el_bh);
1056 return -EPERM; 1056 return -EPERM;
1057 } 1057 }
@@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1145 struct gfs2_ea_location el; 1145 struct gfs2_ea_location el;
1146 int error; 1146 int error;
1147 1147
1148 if (!ip->i_di.di_eattr) 1148 if (!ip->i_eattr)
1149 return -ENODATA; 1149 return -ENODATA;
1150 1150
1151 error = gfs2_ea_find(ip, er, &el); 1151 error = gfs2_ea_find(ip, er, &el);
@@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1309 1309
1310 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); 1310 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1311 1311
1312 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh); 1312 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
1313 if (error) 1313 if (error)
1314 return error; 1314 return error;
1315 1315
@@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1388 if (bstart) 1388 if (bstart)
1389 gfs2_free_meta(ip, bstart, blen); 1389 gfs2_free_meta(ip, bstart, blen);
1390 1390
1391 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; 1391 ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
1392 1392
1393 error = gfs2_meta_inode_buffer(ip, &dibh); 1393 error = gfs2_meta_inode_buffer(ip, &dibh);
1394 if (!error) { 1394 if (!error) {
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
1416 struct buffer_head *dibh; 1416 struct buffer_head *dibh;
1417 int error; 1417 int error;
1418 1418
1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); 1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
1420 if (!rgd) { 1420 if (!rgd) {
1421 gfs2_consist_inode(ip); 1421 gfs2_consist_inode(ip);
1422 return -EIO; 1422 return -EIO;
@@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
1432 if (error) 1432 if (error)
1433 goto out_gunlock; 1433 goto out_gunlock;
1434 1434
1435 gfs2_free_meta(ip, ip->i_di.di_eattr, 1); 1435 gfs2_free_meta(ip, ip->i_eattr, 1);
1436 1436
1437 ip->i_di.di_eattr = 0; 1437 ip->i_eattr = 0;
1438 gfs2_add_inode_blocks(&ip->i_inode, -1); 1438 gfs2_add_inode_blocks(&ip->i_inode, -1);
1439 1439
1440 error = gfs2_meta_inode_buffer(ip, &dibh); 1440 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
1479 if (error) 1479 if (error)
1480 goto out_rindex; 1480 goto out_rindex;
1481 1481
1482 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { 1482 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
1483 error = ea_dealloc_indirect(ip); 1483 error = ea_dealloc_indirect(ip);
1484 if (error) 1484 if (error)
1485 goto out_rindex; 1485 goto out_rindex;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c962283d4e7f..6b983aef785d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -40,6 +40,7 @@
40#include "quota.h" 40#include "quota.h"
41#include "super.h" 41#include "super.h"
42#include "util.h" 42#include "util.h"
43#include "bmap.h"
43 44
44struct gfs2_gl_hash_bucket { 45struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list; 46 struct hlist_head hb_list;
@@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
61 62
62static DECLARE_RWSEM(gfs2_umount_flush_sem); 63static DECLARE_RWSEM(gfs2_umount_flush_sem);
63static struct dentry *gfs2_root; 64static struct dentry *gfs2_root;
64static struct task_struct *scand_process;
65static unsigned int scand_secs = 5;
66static struct workqueue_struct *glock_workqueue; 65static struct workqueue_struct *glock_workqueue;
66static LIST_HEAD(lru_list);
67static atomic_t lru_count = ATOMIC_INIT(0);
68static DEFINE_SPINLOCK(lru_lock);
67 69
68#define GFS2_GL_HASH_SHIFT 15 70#define GFS2_GL_HASH_SHIFT 15
69#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 71#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
@@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
174} 176}
175 177
176/** 178/**
179 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
180 * @gl: the glock
181 *
182 */
183
184static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
185{
186 spin_lock(&lru_lock);
187 if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
188 list_add_tail(&gl->gl_lru, &lru_list);
189 atomic_inc(&lru_count);
190 }
191 spin_unlock(&lru_lock);
192}
193
194/**
177 * gfs2_glock_put() - Decrement reference count on glock 195 * gfs2_glock_put() - Decrement reference count on glock
178 * @gl: The glock to put 196 * @gl: The glock to put
179 * 197 *
@@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl)
187 if (atomic_dec_and_test(&gl->gl_ref)) { 205 if (atomic_dec_and_test(&gl->gl_ref)) {
188 hlist_del(&gl->gl_list); 206 hlist_del(&gl->gl_list);
189 write_unlock(gl_lock_addr(gl->gl_hash)); 207 write_unlock(gl_lock_addr(gl->gl_hash));
208 spin_lock(&lru_lock);
209 if (!list_empty(&gl->gl_lru)) {
210 list_del_init(&gl->gl_lru);
211 atomic_dec(&lru_count);
212 }
213 spin_unlock(&lru_lock);
190 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); 214 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
191 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); 215 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
192 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 216 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
193 glock_free(gl); 217 glock_free(gl);
194 rv = 1; 218 rv = 1;
195 goto out; 219 goto out;
196 } 220 }
197 write_unlock(gl_lock_addr(gl->gl_hash)); 221 write_unlock(gl_lock_addr(gl->gl_hash));
222 /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
223 if (atomic_read(&gl->gl_ref) == 2)
224 gfs2_glock_schedule_for_reclaim(gl);
198out: 225out:
199 return rv; 226 return rv;
200} 227}
@@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
289 * do_promote - promote as many requests as possible on the current queue 316 * do_promote - promote as many requests as possible on the current queue
290 * @gl: The glock 317 * @gl: The glock
291 * 318 *
292 * Returns: true if there is a blocked holder at the head of the list 319 * Returns: 1 if there is a blocked holder at the head of the list, or 2
320 * if a type specific operation is underway.
293 */ 321 */
294 322
295static int do_promote(struct gfs2_glock *gl) 323static int do_promote(struct gfs2_glock *gl)
324__releases(&gl->gl_spin)
325__acquires(&gl->gl_spin)
296{ 326{
297 const struct gfs2_glock_operations *glops = gl->gl_ops; 327 const struct gfs2_glock_operations *glops = gl->gl_ops;
298 struct gfs2_holder *gh, *tmp; 328 struct gfs2_holder *gh, *tmp;
@@ -310,6 +340,8 @@ restart:
310 ret = glops->go_lock(gh); 340 ret = glops->go_lock(gh);
311 spin_lock(&gl->gl_spin); 341 spin_lock(&gl->gl_spin);
312 if (ret) { 342 if (ret) {
343 if (ret == 1)
344 return 2;
313 gh->gh_error = ret; 345 gh->gh_error = ret;
314 list_del_init(&gh->gh_list); 346 list_del_init(&gh->gh_list);
315 gfs2_holder_wake(gh); 347 gfs2_holder_wake(gh);
@@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
414 const struct gfs2_glock_operations *glops = gl->gl_ops; 446 const struct gfs2_glock_operations *glops = gl->gl_ops;
415 struct gfs2_holder *gh; 447 struct gfs2_holder *gh;
416 unsigned state = ret & LM_OUT_ST_MASK; 448 unsigned state = ret & LM_OUT_ST_MASK;
449 int rv;
417 450
418 spin_lock(&gl->gl_spin); 451 spin_lock(&gl->gl_spin);
419 state_change(gl, state); 452 state_change(gl, state);
@@ -468,7 +501,6 @@ retry:
468 gfs2_demote_wake(gl); 501 gfs2_demote_wake(gl);
469 if (state != LM_ST_UNLOCKED) { 502 if (state != LM_ST_UNLOCKED) {
470 if (glops->go_xmote_bh) { 503 if (glops->go_xmote_bh) {
471 int rv;
472 spin_unlock(&gl->gl_spin); 504 spin_unlock(&gl->gl_spin);
473 rv = glops->go_xmote_bh(gl, gh); 505 rv = glops->go_xmote_bh(gl, gh);
474 if (rv == -EAGAIN) 506 if (rv == -EAGAIN)
@@ -479,10 +511,13 @@ retry:
479 goto out; 511 goto out;
480 } 512 }
481 } 513 }
482 do_promote(gl); 514 rv = do_promote(gl);
515 if (rv == 2)
516 goto out_locked;
483 } 517 }
484out: 518out:
485 clear_bit(GLF_LOCK, &gl->gl_flags); 519 clear_bit(GLF_LOCK, &gl->gl_flags);
520out_locked:
486 spin_unlock(&gl->gl_spin); 521 spin_unlock(&gl->gl_spin);
487 gfs2_glock_put(gl); 522 gfs2_glock_put(gl);
488} 523}
@@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
511 */ 546 */
512 547
513static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) 548static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
549__releases(&gl->gl_spin)
550__acquires(&gl->gl_spin)
514{ 551{
515 const struct gfs2_glock_operations *glops = gl->gl_ops; 552 const struct gfs2_glock_operations *glops = gl->gl_ops;
516 struct gfs2_sbd *sdp = gl->gl_sbd; 553 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
576 */ 613 */
577 614
578static void run_queue(struct gfs2_glock *gl, const int nonblock) 615static void run_queue(struct gfs2_glock *gl, const int nonblock)
616__releases(&gl->gl_spin)
617__acquires(&gl->gl_spin)
579{ 618{
580 struct gfs2_holder *gh = NULL; 619 struct gfs2_holder *gh = NULL;
620 int ret;
581 621
582 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) 622 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
583 return; 623 return;
@@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
596 } else { 636 } else {
597 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) 637 if (test_bit(GLF_DEMOTE, &gl->gl_flags))
598 gfs2_demote_wake(gl); 638 gfs2_demote_wake(gl);
599 if (do_promote(gl) == 0) 639 ret = do_promote(gl);
640 if (ret == 0)
600 goto out; 641 goto out;
642 if (ret == 2)
643 return;
601 gh = find_first_waiter(gl); 644 gh = find_first_waiter(gl);
602 gl->gl_target = gh->gh_state; 645 gl->gl_target = gh->gh_state;
603 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 646 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
@@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl)
820 */ 863 */
821 864
822static void handle_callback(struct gfs2_glock *gl, unsigned int state, 865static void handle_callback(struct gfs2_glock *gl, unsigned int state,
823 int remote, unsigned long delay) 866 unsigned long delay)
824{ 867{
825 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; 868 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
826 869
@@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
828 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { 871 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
829 gl->gl_demote_state = state; 872 gl->gl_demote_state = state;
830 gl->gl_demote_time = jiffies; 873 gl->gl_demote_time = jiffies;
831 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
832 gl->gl_object)
833 gfs2_glock_schedule_for_reclaim(gl);
834 } else if (gl->gl_demote_state != LM_ST_UNLOCKED && 874 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
835 gl->gl_demote_state != state) { 875 gl->gl_demote_state != state) {
836 gl->gl_demote_state = LM_ST_UNLOCKED; 876 gl->gl_demote_state = LM_ST_UNLOCKED;
@@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
877 */ 917 */
878 918
879static inline void add_to_queue(struct gfs2_holder *gh) 919static inline void add_to_queue(struct gfs2_holder *gh)
920__releases(&gl->gl_spin)
921__acquires(&gl->gl_spin)
880{ 922{
881 struct gfs2_glock *gl = gh->gh_gl; 923 struct gfs2_glock *gl = gh->gh_gl;
882 struct gfs2_sbd *sdp = gl->gl_sbd; 924 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
998 1040
999 spin_lock(&gl->gl_spin); 1041 spin_lock(&gl->gl_spin);
1000 if (gh->gh_flags & GL_NOCACHE) 1042 if (gh->gh_flags & GL_NOCACHE)
1001 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1043 handle_callback(gl, LM_ST_UNLOCKED, 0);
1002 1044
1003 list_del_init(&gh->gh_list); 1045 list_del_init(&gh->gh_list);
1004 if (find_first_holder(gl) == NULL) { 1046 if (find_first_holder(gl) == NULL) {
@@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1269 delay = gl->gl_ops->go_min_hold_time; 1311 delay = gl->gl_ops->go_min_hold_time;
1270 1312
1271 spin_lock(&gl->gl_spin); 1313 spin_lock(&gl->gl_spin);
1272 handle_callback(gl, state, 1, delay); 1314 handle_callback(gl, state, delay);
1273 spin_unlock(&gl->gl_spin); 1315 spin_unlock(&gl->gl_spin);
1274 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 1316 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1275 gfs2_glock_put(gl); 1317 gfs2_glock_put(gl);
1276} 1318}
1277 1319
1320static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
1321{
1322 struct gfs2_jdesc *jd;
1323
1324 spin_lock(&sdp->sd_jindex_spin);
1325 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
1326 if (jd->jd_jid != jid)
1327 continue;
1328 jd->jd_dirty = 1;
1329 break;
1330 }
1331 spin_unlock(&sdp->sd_jindex_spin);
1332}
1333
1278/** 1334/**
1279 * gfs2_glock_cb - Callback used by locking module 1335 * gfs2_glock_cb - Callback used by locking module
1280 * @sdp: Pointer to the superblock 1336 * @sdp: Pointer to the superblock
@@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1338 * Returns: 1 if it's ok 1394 * Returns: 1 if it's ok
1339 */ 1395 */
1340 1396
1341static int demote_ok(struct gfs2_glock *gl) 1397static int demote_ok(const struct gfs2_glock *gl)
1342{ 1398{
1343 const struct gfs2_glock_operations *glops = gl->gl_ops; 1399 const struct gfs2_glock_operations *glops = gl->gl_ops;
1344 int demote = 1;
1345
1346 if (test_bit(GLF_STICKY, &gl->gl_flags))
1347 demote = 0;
1348 else if (glops->go_demote_ok)
1349 demote = glops->go_demote_ok(gl);
1350
1351 return demote;
1352}
1353
1354/**
1355 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1356 * @gl: the glock
1357 *
1358 */
1359
1360void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1361{
1362 struct gfs2_sbd *sdp = gl->gl_sbd;
1363 1400
1364 spin_lock(&sdp->sd_reclaim_lock); 1401 if (gl->gl_state == LM_ST_UNLOCKED)
1365 if (list_empty(&gl->gl_reclaim)) { 1402 return 0;
1366 gfs2_glock_hold(gl); 1403 if (!list_empty(&gl->gl_holders))
1367 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); 1404 return 0;
1368 atomic_inc(&sdp->sd_reclaim_count); 1405 if (glops->go_demote_ok)
1369 spin_unlock(&sdp->sd_reclaim_lock); 1406 return glops->go_demote_ok(gl);
1370 wake_up(&sdp->sd_reclaim_wq); 1407 return 1;
1371 } else
1372 spin_unlock(&sdp->sd_reclaim_lock);
1373} 1408}
1374 1409
1375/**
1376 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1377 * @sdp: the filesystem
1378 *
1379 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
1380 * different glock and we notice that there are a lot of glocks in the
1381 * reclaim list.
1382 *
1383 */
1384 1410
1385void gfs2_reclaim_glock(struct gfs2_sbd *sdp) 1411static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1386{ 1412{
1387 struct gfs2_glock *gl; 1413 struct gfs2_glock *gl;
1388 int done_callback = 0; 1414 int may_demote;
1415 int nr_skipped = 0;
1416 int got_ref = 0;
1417 LIST_HEAD(skipped);
1389 1418
1390 spin_lock(&sdp->sd_reclaim_lock); 1419 if (nr == 0)
1391 if (list_empty(&sdp->sd_reclaim_list)) { 1420 goto out;
1392 spin_unlock(&sdp->sd_reclaim_lock);
1393 return;
1394 }
1395 gl = list_entry(sdp->sd_reclaim_list.next,
1396 struct gfs2_glock, gl_reclaim);
1397 list_del_init(&gl->gl_reclaim);
1398 spin_unlock(&sdp->sd_reclaim_lock);
1399 1421
1400 atomic_dec(&sdp->sd_reclaim_count); 1422 if (!(gfp_mask & __GFP_FS))
1401 atomic_inc(&sdp->sd_reclaimed); 1423 return -1;
1402 1424
1403 spin_lock(&gl->gl_spin); 1425 spin_lock(&lru_lock);
1404 if (find_first_holder(gl) == NULL && 1426 while(nr && !list_empty(&lru_list)) {
1405 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { 1427 gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
1406 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1428 list_del_init(&gl->gl_lru);
1407 done_callback = 1; 1429 atomic_dec(&lru_count);
1430
1431 /* Test for being demotable */
1432 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
1433 gfs2_glock_hold(gl);
1434 got_ref = 1;
1435 spin_unlock(&lru_lock);
1436 spin_lock(&gl->gl_spin);
1437 may_demote = demote_ok(gl);
1438 spin_unlock(&gl->gl_spin);
1439 clear_bit(GLF_LOCK, &gl->gl_flags);
1440 if (may_demote) {
1441 handle_callback(gl, LM_ST_UNLOCKED, 0);
1442 nr--;
1443 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1444 gfs2_glock_put(gl);
1445 }
1446 spin_lock(&lru_lock);
1447 if (may_demote)
1448 continue;
1449 }
1450 if (list_empty(&gl->gl_lru) &&
1451 (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
1452 nr_skipped++;
1453 list_add(&gl->gl_lru, &skipped);
1454 }
1455 if (got_ref) {
1456 spin_unlock(&lru_lock);
1457 gfs2_glock_put(gl);
1458 spin_lock(&lru_lock);
1459 got_ref = 0;
1460 }
1408 } 1461 }
1409 spin_unlock(&gl->gl_spin); 1462 list_splice(&skipped, &lru_list);
1410 if (!done_callback || 1463 atomic_add(nr_skipped, &lru_count);
1411 queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1464 spin_unlock(&lru_lock);
1412 gfs2_glock_put(gl); 1465out:
1466 return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
1413} 1467}
1414 1468
1469static struct shrinker glock_shrinker = {
1470 .shrink = gfs2_shrink_glock_memory,
1471 .seeks = DEFAULT_SEEKS,
1472};
1473
1415/** 1474/**
1416 * examine_bucket - Call a function for glock in a hash bucket 1475 * examine_bucket - Call a function for glock in a hash bucket
1417 * @examiner: the function 1476 * @examiner: the function
@@ -1457,26 +1516,6 @@ out:
1457} 1516}
1458 1517
1459/** 1518/**
1460 * scan_glock - look at a glock and see if we can reclaim it
1461 * @gl: the glock to look at
1462 *
1463 */
1464
1465static void scan_glock(struct gfs2_glock *gl)
1466{
1467 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
1468 return;
1469 if (test_bit(GLF_LOCK, &gl->gl_flags))
1470 return;
1471
1472 spin_lock(&gl->gl_spin);
1473 if (find_first_holder(gl) == NULL &&
1474 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1475 gfs2_glock_schedule_for_reclaim(gl);
1476 spin_unlock(&gl->gl_spin);
1477}
1478
1479/**
1480 * clear_glock - look at a glock and see if we can free it from glock cache 1519 * clear_glock - look at a glock and see if we can free it from glock cache
1481 * @gl: the glock to look at 1520 * @gl: the glock to look at
1482 * 1521 *
@@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl)
1484 1523
1485static void clear_glock(struct gfs2_glock *gl) 1524static void clear_glock(struct gfs2_glock *gl)
1486{ 1525{
1487 struct gfs2_sbd *sdp = gl->gl_sbd; 1526 spin_lock(&lru_lock);
1488 int released; 1527 if (!list_empty(&gl->gl_lru)) {
1489 1528 list_del_init(&gl->gl_lru);
1490 spin_lock(&sdp->sd_reclaim_lock); 1529 atomic_dec(&lru_count);
1491 if (!list_empty(&gl->gl_reclaim)) {
1492 list_del_init(&gl->gl_reclaim);
1493 atomic_dec(&sdp->sd_reclaim_count);
1494 spin_unlock(&sdp->sd_reclaim_lock);
1495 released = gfs2_glock_put(gl);
1496 gfs2_assert(sdp, !released);
1497 } else {
1498 spin_unlock(&sdp->sd_reclaim_lock);
1499 } 1530 }
1531 spin_unlock(&lru_lock);
1500 1532
1501 spin_lock(&gl->gl_spin); 1533 spin_lock(&gl->gl_spin);
1502 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1534 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
1503 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1535 handle_callback(gl, LM_ST_UNLOCKED, 0);
1504 spin_unlock(&gl->gl_spin); 1536 spin_unlock(&gl->gl_spin);
1505 gfs2_glock_hold(gl); 1537 gfs2_glock_hold(gl);
1506 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1538 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1548 } 1580 }
1549} 1581}
1550 1582
1583void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
1584{
1585 struct gfs2_glock *gl = ip->i_gl;
1586 int ret;
1587
1588 ret = gfs2_truncatei_resume(ip);
1589 gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
1590
1591 spin_lock(&gl->gl_spin);
1592 clear_bit(GLF_LOCK, &gl->gl_flags);
1593 run_queue(gl, 1);
1594 spin_unlock(&gl->gl_spin);
1595}
1596
1551static const char *state2str(unsigned state) 1597static const char *state2str(unsigned state)
1552{ 1598{
1553 switch(state) { 1599 switch(state) {
@@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1623 char *p = buf; 1669 char *p = buf;
1624 if (test_bit(GLF_LOCK, gflags)) 1670 if (test_bit(GLF_LOCK, gflags))
1625 *p++ = 'l'; 1671 *p++ = 'l';
1626 if (test_bit(GLF_STICKY, gflags))
1627 *p++ = 's';
1628 if (test_bit(GLF_DEMOTE, gflags)) 1672 if (test_bit(GLF_DEMOTE, gflags))
1629 *p++ = 'D'; 1673 *p++ = 'D';
1630 if (test_bit(GLF_PENDING_DEMOTE, gflags)) 1674 if (test_bit(GLF_PENDING_DEMOTE, gflags))
@@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1743 return error; 1787 return error;
1744} 1788}
1745 1789
1746/**
1747 * gfs2_scand - Look for cached glocks and inodes to toss from memory
1748 * @sdp: Pointer to GFS2 superblock
1749 *
1750 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
1751 * See gfs2_glockd()
1752 */
1753
1754static int gfs2_scand(void *data)
1755{
1756 unsigned x;
1757 unsigned delay;
1758
1759 while (!kthread_should_stop()) {
1760 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1761 examine_bucket(scan_glock, NULL, x);
1762 if (freezing(current))
1763 refrigerator();
1764 delay = scand_secs;
1765 if (delay < 1)
1766 delay = 1;
1767 schedule_timeout_interruptible(delay * HZ);
1768 }
1769
1770 return 0;
1771}
1772
1773
1774 1790
1775int __init gfs2_glock_init(void) 1791int __init gfs2_glock_init(void)
1776{ 1792{
@@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void)
1784 } 1800 }
1785#endif 1801#endif
1786 1802
1787 scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
1788 if (IS_ERR(scand_process))
1789 return PTR_ERR(scand_process);
1790
1791 glock_workqueue = create_workqueue("glock_workqueue"); 1803 glock_workqueue = create_workqueue("glock_workqueue");
1792 if (IS_ERR(glock_workqueue)) { 1804 if (IS_ERR(glock_workqueue))
1793 kthread_stop(scand_process);
1794 return PTR_ERR(glock_workqueue); 1805 return PTR_ERR(glock_workqueue);
1795 } 1806
1807 register_shrinker(&glock_shrinker);
1796 1808
1797 return 0; 1809 return 0;
1798} 1810}
1799 1811
1800void gfs2_glock_exit(void) 1812void gfs2_glock_exit(void)
1801{ 1813{
1814 unregister_shrinker(&glock_shrinker);
1802 destroy_workqueue(glock_workqueue); 1815 destroy_workqueue(glock_workqueue);
1803 kthread_stop(scand_process);
1804} 1816}
1805 1817
1806module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
1807MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
1808
1809static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1818static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1810{ 1819{
1811 struct gfs2_glock *gl; 1820 struct gfs2_glock *gl;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 695c6b193611..543ec7ecfbda 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
129void gfs2_lvb_unhold(struct gfs2_glock *gl); 129void gfs2_lvb_unhold(struct gfs2_glock *gl);
130 130
131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); 131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
132void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
133void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 132void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
134void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 133void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
134void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
135 135
136int __init gfs2_glock_init(void); 136int __init gfs2_glock_init(void);
137void gfs2_glock_exit(void); 137void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c6c318c2a0f6..8522d3aa64fc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
201 * Returns: 1 if it's ok 201 * Returns: 1 if it's ok
202 */ 202 */
203 203
204static int inode_go_demote_ok(struct gfs2_glock *gl) 204static int inode_go_demote_ok(const struct gfs2_glock *gl)
205{ 205{
206 struct gfs2_sbd *sdp = gl->gl_sbd; 206 struct gfs2_sbd *sdp = gl->gl_sbd;
207 int demote = 0; 207 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
208 208 return 0;
209 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) 209 return 1;
210 demote = 1;
211 else if (!sdp->sd_args.ar_localcaching &&
212 time_after_eq(jiffies, gl->gl_stamp +
213 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
214 demote = 1;
215
216 return demote;
217} 210}
218 211
219/** 212/**
@@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl)
227static int inode_go_lock(struct gfs2_holder *gh) 220static int inode_go_lock(struct gfs2_holder *gh)
228{ 221{
229 struct gfs2_glock *gl = gh->gh_gl; 222 struct gfs2_glock *gl = gh->gh_gl;
223 struct gfs2_sbd *sdp = gl->gl_sbd;
230 struct gfs2_inode *ip = gl->gl_object; 224 struct gfs2_inode *ip = gl->gl_object;
231 int error = 0; 225 int error = 0;
232 226
@@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh)
239 return error; 233 return error;
240 } 234 }
241 235
242 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && 236 if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
243 (gl->gl_state == LM_ST_EXCLUSIVE) && 237 (gl->gl_state == LM_ST_EXCLUSIVE) &&
244 (gh->gh_state == LM_ST_EXCLUSIVE)) 238 (gh->gh_state == LM_ST_EXCLUSIVE)) {
245 error = gfs2_truncatei_resume(ip); 239 spin_lock(&sdp->sd_trunc_lock);
240 if (list_empty(&ip->i_trunc_list))
241 list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
242 spin_unlock(&sdp->sd_trunc_lock);
243 wake_up(&sdp->sd_quota_wait);
244 return 1;
245 }
246 246
247 return error; 247 return error;
248} 248}
@@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
260 const struct gfs2_inode *ip = gl->gl_object; 260 const struct gfs2_inode *ip = gl->gl_object;
261 if (ip == NULL) 261 if (ip == NULL)
262 return 0; 262 return 0;
263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", 263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
264 (unsigned long long)ip->i_no_formal_ino, 264 (unsigned long long)ip->i_no_formal_ino,
265 (unsigned long long)ip->i_no_addr, 265 (unsigned long long)ip->i_no_addr,
266 IF2DT(ip->i_inode.i_mode), ip->i_flags); 266 IF2DT(ip->i_inode.i_mode), ip->i_flags,
267 (unsigned int)ip->i_diskflags,
268 (unsigned long long)ip->i_inode.i_size,
269 (unsigned long long)ip->i_disksize);
267 return 0; 270 return 0;
268} 271}
269 272
@@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
274 * Returns: 1 if it's ok 277 * Returns: 1 if it's ok
275 */ 278 */
276 279
277static int rgrp_go_demote_ok(struct gfs2_glock *gl) 280static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
278{ 281{
279 return !gl->gl_aspace->i_mapping->nrpages; 282 return !gl->gl_aspace->i_mapping->nrpages;
280} 283}
@@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
318 const struct gfs2_rgrpd *rgd = gl->gl_object; 321 const struct gfs2_rgrpd *rgd = gl->gl_object;
319 if (rgd == NULL) 322 if (rgd == NULL)
320 return 0; 323 return 0;
321 gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); 324 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
325 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
326 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
322 return 0; 327 return 0;
323} 328}
324 329
@@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
374} 379}
375 380
376/** 381/**
382 * trans_go_demote_ok
383 * @gl: the glock
384 *
385 * Always returns 0
386 */
387
388static int trans_go_demote_ok(const struct gfs2_glock *gl)
389{
390 return 0;
391}
392
393/**
377 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock 394 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
378 * @gl: the glock 395 * @gl: the glock
379 * 396 *
380 * Returns: 1 if it's ok 397 * Returns: 1 if it's ok
381 */ 398 */
382 399
383static int quota_go_demote_ok(struct gfs2_glock *gl) 400static int quota_go_demote_ok(const struct gfs2_glock *gl)
384{ 401{
385 return !atomic_read(&gl->gl_lvb_count); 402 return !atomic_read(&gl->gl_lvb_count);
386} 403}
@@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
414const struct gfs2_glock_operations gfs2_trans_glops = { 431const struct gfs2_glock_operations gfs2_trans_glops = {
415 .go_xmote_th = trans_go_sync, 432 .go_xmote_th = trans_go_sync,
416 .go_xmote_bh = trans_go_xmote_bh, 433 .go_xmote_bh = trans_go_xmote_bh,
434 .go_demote_ok = trans_go_demote_ok,
417 .go_type = LM_TYPE_NONDISK, 435 .go_type = LM_TYPE_NONDISK,
418}; 436};
419 437
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f566ec1b4e8e..608849d00021 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -68,12 +68,6 @@ struct gfs2_bitmap {
68 u32 bi_len; 68 u32 bi_len;
69}; 69};
70 70
71struct gfs2_rgrp_host {
72 u32 rg_free;
73 u32 rg_dinodes;
74 u64 rg_igeneration;
75};
76
77struct gfs2_rgrpd { 71struct gfs2_rgrpd {
78 struct list_head rd_list; /* Link with superblock */ 72 struct list_head rd_list; /* Link with superblock */
79 struct list_head rd_list_mru; 73 struct list_head rd_list_mru;
@@ -83,14 +77,16 @@ struct gfs2_rgrpd {
83 u32 rd_length; /* length of rgrp header in fs blocks */ 77 u32 rd_length; /* length of rgrp header in fs blocks */
84 u32 rd_data; /* num of data blocks in rgrp */ 78 u32 rd_data; /* num of data blocks in rgrp */
85 u32 rd_bitbytes; /* number of bytes in data bitmaps */ 79 u32 rd_bitbytes; /* number of bytes in data bitmaps */
86 struct gfs2_rgrp_host rd_rg; 80 u32 rd_free;
81 u32 rd_free_clone;
82 u32 rd_dinodes;
83 u64 rd_igeneration;
87 struct gfs2_bitmap *rd_bits; 84 struct gfs2_bitmap *rd_bits;
88 unsigned int rd_bh_count;
89 struct mutex rd_mutex; 85 struct mutex rd_mutex;
90 u32 rd_free_clone;
91 struct gfs2_log_element rd_le; 86 struct gfs2_log_element rd_le;
92 u32 rd_last_alloc;
93 struct gfs2_sbd *rd_sbd; 87 struct gfs2_sbd *rd_sbd;
88 unsigned int rd_bh_count;
89 u32 rd_last_alloc;
94 unsigned char rd_flags; 90 unsigned char rd_flags;
95#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ 91#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */
96#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ 92#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */
@@ -129,7 +125,7 @@ struct gfs2_glock_operations {
129 void (*go_xmote_th) (struct gfs2_glock *gl); 125 void (*go_xmote_th) (struct gfs2_glock *gl);
130 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); 126 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
131 void (*go_inval) (struct gfs2_glock *gl, int flags); 127 void (*go_inval) (struct gfs2_glock *gl, int flags);
132 int (*go_demote_ok) (struct gfs2_glock *gl); 128 int (*go_demote_ok) (const struct gfs2_glock *gl);
133 int (*go_lock) (struct gfs2_holder *gh); 129 int (*go_lock) (struct gfs2_holder *gh);
134 void (*go_unlock) (struct gfs2_holder *gh); 130 void (*go_unlock) (struct gfs2_holder *gh);
135 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 131 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
@@ -159,7 +155,6 @@ struct gfs2_holder {
159 155
160enum { 156enum {
161 GLF_LOCK = 1, 157 GLF_LOCK = 1,
162 GLF_STICKY = 2,
163 GLF_DEMOTE = 3, 158 GLF_DEMOTE = 3,
164 GLF_PENDING_DEMOTE = 4, 159 GLF_PENDING_DEMOTE = 4,
165 GLF_DEMOTE_IN_PROGRESS = 5, 160 GLF_DEMOTE_IN_PROGRESS = 5,
@@ -194,7 +189,7 @@ struct gfs2_glock {
194 unsigned long gl_tchange; 189 unsigned long gl_tchange;
195 void *gl_object; 190 void *gl_object;
196 191
197 struct list_head gl_reclaim; 192 struct list_head gl_lru;
198 193
199 struct gfs2_sbd *gl_sbd; 194 struct gfs2_sbd *gl_sbd;
200 195
@@ -233,29 +228,24 @@ enum {
233 GIF_USER = 4, /* user inode, not metadata addr space */ 228 GIF_USER = 4, /* user inode, not metadata addr space */
234}; 229};
235 230
236struct gfs2_dinode_host {
237 u64 di_size; /* number of bytes in file */
238 u64 di_generation; /* generation number for NFS */
239 u32 di_flags; /* GFS2_DIF_... */
240 /* These only apply to directories */
241 u32 di_entries; /* The number of entries in the directory */
242 u64 di_eattr; /* extended attribute block number */
243};
244 231
245struct gfs2_inode { 232struct gfs2_inode {
246 struct inode i_inode; 233 struct inode i_inode;
247 u64 i_no_addr; 234 u64 i_no_addr;
248 u64 i_no_formal_ino; 235 u64 i_no_formal_ino;
236 u64 i_generation;
237 u64 i_eattr;
238 loff_t i_disksize;
249 unsigned long i_flags; /* GIF_... */ 239 unsigned long i_flags; /* GIF_... */
250
251 struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
252
253 struct gfs2_glock *i_gl; /* Move into i_gh? */ 240 struct gfs2_glock *i_gl; /* Move into i_gh? */
254 struct gfs2_holder i_iopen_gh; 241 struct gfs2_holder i_iopen_gh;
255 struct gfs2_holder i_gh; /* for prepare/commit_write only */ 242 struct gfs2_holder i_gh; /* for prepare/commit_write only */
256 struct gfs2_alloc *i_alloc; 243 struct gfs2_alloc *i_alloc;
257 u64 i_goal; /* goal block for allocations */ 244 u64 i_goal; /* goal block for allocations */
258 struct rw_semaphore i_rw_mutex; 245 struct rw_semaphore i_rw_mutex;
246 struct list_head i_trunc_list;
247 u32 i_entries;
248 u32 i_diskflags;
259 u8 i_height; 249 u8 i_height;
260 u8 i_depth; 250 u8 i_depth;
261}; 251};
@@ -406,13 +396,11 @@ struct gfs2_args {
406struct gfs2_tune { 396struct gfs2_tune {
407 spinlock_t gt_spin; 397 spinlock_t gt_spin;
408 398
409 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
410 unsigned int gt_incore_log_blocks; 399 unsigned int gt_incore_log_blocks;
411 unsigned int gt_log_flush_secs; 400 unsigned int gt_log_flush_secs;
412 401
413 unsigned int gt_recoverd_secs; 402 unsigned int gt_recoverd_secs;
414 unsigned int gt_logd_secs; 403 unsigned int gt_logd_secs;
415 unsigned int gt_quotad_secs;
416 404
417 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 405 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
418 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ 406 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
@@ -488,10 +476,6 @@ struct gfs2_sbd {
488 /* Lock Stuff */ 476 /* Lock Stuff */
489 477
490 struct lm_lockstruct sd_lockstruct; 478 struct lm_lockstruct sd_lockstruct;
491 struct list_head sd_reclaim_list;
492 spinlock_t sd_reclaim_lock;
493 wait_queue_head_t sd_reclaim_wq;
494 atomic_t sd_reclaim_count;
495 struct gfs2_holder sd_live_gh; 479 struct gfs2_holder sd_live_gh;
496 struct gfs2_glock *sd_rename_gl; 480 struct gfs2_glock *sd_rename_gl;
497 struct gfs2_glock *sd_trans_gl; 481 struct gfs2_glock *sd_trans_gl;
@@ -519,7 +503,6 @@ struct gfs2_sbd {
519 spinlock_t sd_statfs_spin; 503 spinlock_t sd_statfs_spin;
520 struct gfs2_statfs_change_host sd_statfs_master; 504 struct gfs2_statfs_change_host sd_statfs_master;
521 struct gfs2_statfs_change_host sd_statfs_local; 505 struct gfs2_statfs_change_host sd_statfs_local;
522 unsigned long sd_statfs_sync_time;
523 506
524 /* Resource group stuff */ 507 /* Resource group stuff */
525 508
@@ -552,8 +535,6 @@ struct gfs2_sbd {
552 struct task_struct *sd_recoverd_process; 535 struct task_struct *sd_recoverd_process;
553 struct task_struct *sd_logd_process; 536 struct task_struct *sd_logd_process;
554 struct task_struct *sd_quotad_process; 537 struct task_struct *sd_quotad_process;
555 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
556 unsigned int sd_glockd_num;
557 538
558 /* Quota stuff */ 539 /* Quota stuff */
559 540
@@ -561,13 +542,15 @@ struct gfs2_sbd {
561 atomic_t sd_quota_count; 542 atomic_t sd_quota_count;
562 spinlock_t sd_quota_spin; 543 spinlock_t sd_quota_spin;
563 struct mutex sd_quota_mutex; 544 struct mutex sd_quota_mutex;
545 wait_queue_head_t sd_quota_wait;
546 struct list_head sd_trunc_list;
547 spinlock_t sd_trunc_lock;
564 548
565 unsigned int sd_quota_slots; 549 unsigned int sd_quota_slots;
566 unsigned int sd_quota_chunks; 550 unsigned int sd_quota_chunks;
567 unsigned char **sd_quota_bitmap; 551 unsigned char **sd_quota_bitmap;
568 552
569 u64 sd_quota_sync_gen; 553 u64 sd_quota_sync_gen;
570 unsigned long sd_quota_sync_time;
571 554
572 /* Log stuff */ 555 /* Log stuff */
573 556
@@ -624,10 +607,6 @@ struct gfs2_sbd {
624 struct mutex sd_freeze_lock; 607 struct mutex sd_freeze_lock;
625 unsigned int sd_freeze_count; 608 unsigned int sd_freeze_count;
626 609
627 /* Counters */
628
629 atomic_t sd_reclaimed;
630
631 char sd_fsname[GFS2_FSNAME_LEN]; 610 char sd_fsname[GFS2_FSNAME_LEN];
632 char sd_table_name[GFS2_FSNAME_LEN]; 611 char sd_table_name[GFS2_FSNAME_LEN];
633 char sd_proto_name[GFS2_FSNAME_LEN]; 612 char sd_proto_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d57616840e89..3b87c188da41 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -32,7 +32,6 @@
32#include "log.h" 32#include "log.h"
33#include "meta_io.h" 33#include "meta_io.h"
34#include "ops_address.h" 34#include "ops_address.h"
35#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
38#include "trans.h" 37#include "trans.h"
@@ -248,7 +247,6 @@ fail:
248 247
249static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 248static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
250{ 249{
251 struct gfs2_dinode_host *di = &ip->i_di;
252 const struct gfs2_dinode *str = buf; 250 const struct gfs2_dinode *str = buf;
253 struct timespec atime; 251 struct timespec atime;
254 u16 height, depth; 252 u16 height, depth;
@@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
274 * to do that. 272 * to do that.
275 */ 273 */
276 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 274 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
277 di->di_size = be64_to_cpu(str->di_size); 275 ip->i_disksize = be64_to_cpu(str->di_size);
278 i_size_write(&ip->i_inode, di->di_size); 276 i_size_write(&ip->i_inode, ip->i_disksize);
279 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 277 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
280 atime.tv_sec = be64_to_cpu(str->di_atime); 278 atime.tv_sec = be64_to_cpu(str->di_atime);
281 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 279 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
287 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); 285 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
288 286
289 ip->i_goal = be64_to_cpu(str->di_goal_meta); 287 ip->i_goal = be64_to_cpu(str->di_goal_meta);
290 di->di_generation = be64_to_cpu(str->di_generation); 288 ip->i_generation = be64_to_cpu(str->di_generation);
291 289
292 di->di_flags = be32_to_cpu(str->di_flags); 290 ip->i_diskflags = be32_to_cpu(str->di_flags);
293 gfs2_set_inode_flags(&ip->i_inode); 291 gfs2_set_inode_flags(&ip->i_inode);
294 height = be16_to_cpu(str->di_height); 292 height = be16_to_cpu(str->di_height);
295 if (unlikely(height > GFS2_MAX_META_HEIGHT)) 293 if (unlikely(height > GFS2_MAX_META_HEIGHT))
@@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
300 if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) 298 if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
301 goto corrupt; 299 goto corrupt;
302 ip->i_depth = (u8)depth; 300 ip->i_depth = (u8)depth;
303 di->di_entries = be32_to_cpu(str->di_entries); 301 ip->i_entries = be32_to_cpu(str->di_entries);
304 302
305 di->di_eattr = be64_to_cpu(str->di_eattr); 303 ip->i_eattr = be64_to_cpu(str->di_eattr);
306 if (S_ISREG(ip->i_inode.i_mode)) 304 if (S_ISREG(ip->i_inode.i_mode))
307 gfs2_set_aops(&ip->i_inode); 305 gfs2_set_aops(&ip->i_inode);
308 306
@@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
388 gfs2_free_di(rgd, ip); 386 gfs2_free_di(rgd, ip);
389 387
390 gfs2_trans_end(sdp); 388 gfs2_trans_end(sdp);
391 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
392 389
393out_rg_gunlock: 390out_rg_gunlock:
394 gfs2_glock_dq_uninit(&al->al_rgd_gh); 391 gfs2_glock_dq_uninit(&al->al_rgd_gh);
@@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
690 return error; 687 return error;
691 } 688 }
692 689
693 if (dip->i_di.di_entries == (u32)-1) 690 if (dip->i_entries == (u32)-1)
694 return -EFBIG; 691 return -EFBIG;
695 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) 692 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
696 return -EMLINK; 693 return -EMLINK;
@@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
790 di->di_flags = 0; 787 di->di_flags = 0;
791 788
792 if (S_ISREG(mode)) { 789 if (S_ISREG(mode)) {
793 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || 790 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
794 gfs2_tune_get(sdp, gt_new_files_jdata)) 791 gfs2_tune_get(sdp, gt_new_files_jdata))
795 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); 792 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
796 } else if (S_ISDIR(mode)) { 793 } else if (S_ISDIR(mode)) {
797 di->di_flags |= cpu_to_be32(dip->i_di.di_flags & 794 di->di_flags |= cpu_to_be32(dip->i_diskflags &
798 GFS2_DIF_INHERIT_JDATA); 795 GFS2_DIF_INHERIT_JDATA);
799 } 796 }
800 797
@@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
1068 struct qstr dotname; 1065 struct qstr dotname;
1069 int error; 1066 int error;
1070 1067
1071 if (ip->i_di.di_entries != 2) { 1068 if (ip->i_entries != 2) {
1072 if (gfs2_consist_inode(ip)) 1069 if (gfs2_consist_inode(ip))
1073 gfs2_dinode_print(ip); 1070 gfs2_dinode_print(ip);
1074 return -EIO; 1071 return -EIO;
@@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1168 return error; 1165 return error;
1169 } 1166 }
1170 1167
1171 if (!ip->i_di.di_size) { 1168 if (!ip->i_disksize) {
1172 gfs2_consist_inode(ip); 1169 gfs2_consist_inode(ip);
1173 error = -EIO; 1170 error = -EIO;
1174 goto out; 1171 goto out;
@@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1178 if (error) 1175 if (error)
1179 goto out; 1176 goto out;
1180 1177
1181 x = ip->i_di.di_size + 1; 1178 x = ip->i_disksize + 1;
1182 if (x > *len) { 1179 if (x > *len) {
1183 *buf = kmalloc(x, GFP_NOFS); 1180 *buf = kmalloc(x, GFP_NOFS);
1184 if (!*buf) { 1181 if (!*buf) {
@@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1242 1239
1243void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) 1240void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1244{ 1241{
1245 const struct gfs2_dinode_host *di = &ip->i_di;
1246 struct gfs2_dinode *str = buf; 1242 struct gfs2_dinode *str = buf;
1247 1243
1248 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 1244 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1256 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 1252 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1257 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1253 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1258 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1254 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1259 str->di_size = cpu_to_be64(di->di_size); 1255 str->di_size = cpu_to_be64(ip->i_disksize);
1260 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 1256 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1261 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1257 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1262 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1258 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1264 1260
1265 str->di_goal_meta = cpu_to_be64(ip->i_goal); 1261 str->di_goal_meta = cpu_to_be64(ip->i_goal);
1266 str->di_goal_data = cpu_to_be64(ip->i_goal); 1262 str->di_goal_data = cpu_to_be64(ip->i_goal);
1267 str->di_generation = cpu_to_be64(di->di_generation); 1263 str->di_generation = cpu_to_be64(ip->i_generation);
1268 1264
1269 str->di_flags = cpu_to_be32(di->di_flags); 1265 str->di_flags = cpu_to_be32(ip->i_diskflags);
1270 str->di_height = cpu_to_be16(ip->i_height); 1266 str->di_height = cpu_to_be16(ip->i_height);
1271 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && 1267 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
1272 !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? 1268 !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
1273 GFS2_FORMAT_DE : 0); 1269 GFS2_FORMAT_DE : 0);
1274 str->di_depth = cpu_to_be16(ip->i_depth); 1270 str->di_depth = cpu_to_be16(ip->i_depth);
1275 str->di_entries = cpu_to_be32(di->di_entries); 1271 str->di_entries = cpu_to_be32(ip->i_entries);
1276 1272
1277 str->di_eattr = cpu_to_be64(di->di_eattr); 1273 str->di_eattr = cpu_to_be64(ip->i_eattr);
1278 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); 1274 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1279 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); 1275 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
1280 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); 1276 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
@@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1282 1278
1283void gfs2_dinode_print(const struct gfs2_inode *ip) 1279void gfs2_dinode_print(const struct gfs2_inode *ip)
1284{ 1280{
1285 const struct gfs2_dinode_host *di = &ip->i_di;
1286
1287 printk(KERN_INFO " no_formal_ino = %llu\n", 1281 printk(KERN_INFO " no_formal_ino = %llu\n",
1288 (unsigned long long)ip->i_no_formal_ino); 1282 (unsigned long long)ip->i_no_formal_ino);
1289 printk(KERN_INFO " no_addr = %llu\n", 1283 printk(KERN_INFO " no_addr = %llu\n",
1290 (unsigned long long)ip->i_no_addr); 1284 (unsigned long long)ip->i_no_addr);
1291 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); 1285 printk(KERN_INFO " i_disksize = %llu\n",
1286 (unsigned long long)ip->i_disksize);
1292 printk(KERN_INFO " blocks = %llu\n", 1287 printk(KERN_INFO " blocks = %llu\n",
1293 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 1288 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1294 printk(KERN_INFO " i_goal = %llu\n", 1289 printk(KERN_INFO " i_goal = %llu\n",
1295 (unsigned long long)ip->i_goal); 1290 (unsigned long long)ip->i_goal);
1296 printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); 1291 printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags);
1297 printk(KERN_INFO " i_height = %u\n", ip->i_height); 1292 printk(KERN_INFO " i_height = %u\n", ip->i_height);
1298 printk(KERN_INFO " i_depth = %u\n", ip->i_depth); 1293 printk(KERN_INFO " i_depth = %u\n", ip->i_depth);
1299 printk(KERN_INFO " di_entries = %u\n", di->di_entries); 1294 printk(KERN_INFO " i_entries = %u\n", ip->i_entries);
1300 printk(KERN_INFO " di_eattr = %llu\n", 1295 printk(KERN_INFO " i_eattr = %llu\n",
1301 (unsigned long long)di->di_eattr); 1296 (unsigned long long)ip->i_eattr);
1302} 1297}
1303 1298
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2d43f69610a0..d5329364cdff 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,6 +10,7 @@
10#ifndef __INODE_DOT_H__ 10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__ 11#define __INODE_DOT_H__
12 12
13#include <linux/fs.h>
13#include "util.h" 14#include "util.h"
14 15
15static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 16static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
19 20
20static inline int gfs2_is_jdata(const struct gfs2_inode *ip) 21static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
21{ 22{
22 return ip->i_di.di_flags & GFS2_DIF_JDATA; 23 return ip->i_diskflags & GFS2_DIF_JDATA;
23} 24}
24 25
25static inline int gfs2_is_writeback(const struct gfs2_inode *ip) 26static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
@@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
97void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 98void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
98void gfs2_dinode_print(const struct gfs2_inode *ip); 99void gfs2_dinode_print(const struct gfs2_inode *ip);
99 100
101extern const struct inode_operations gfs2_file_iops;
102extern const struct inode_operations gfs2_dir_iops;
103extern const struct inode_operations gfs2_symlink_iops;
104extern const struct file_operations gfs2_file_fops;
105extern const struct file_operations gfs2_dir_fops;
106extern const struct file_operations gfs2_file_fops_nolock;
107extern const struct file_operations gfs2_dir_fops_nolock;
108
109extern void gfs2_set_inode_flags(struct inode *inode);
110
100#endif /* __INODE_DOT_H__ */ 111#endif /* __INODE_DOT_H__ */
101 112
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 0c4cbe6c8285..1aa7eb6a0226 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -194,17 +194,25 @@ out:
194static void gdlm_recovery_done(void *lockspace, unsigned int jid, 194static void gdlm_recovery_done(void *lockspace, unsigned int jid,
195 unsigned int message) 195 unsigned int message)
196{ 196{
197 char env_jid[20];
198 char env_status[20];
199 char *envp[] = { env_jid, env_status, NULL };
197 struct gdlm_ls *ls = lockspace; 200 struct gdlm_ls *ls = lockspace;
198 ls->recover_jid_done = jid; 201 ls->recover_jid_done = jid;
199 ls->recover_jid_status = message; 202 ls->recover_jid_status = message;
200 kobject_uevent(&ls->kobj, KOBJ_CHANGE); 203 sprintf(env_jid, "JID=%d", jid);
204 sprintf(env_status, "RECOVERY=%s",
205 message == LM_RD_SUCCESS ? "Done" : "Failed");
206 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
201} 207}
202 208
203static void gdlm_others_may_mount(void *lockspace) 209static void gdlm_others_may_mount(void *lockspace)
204{ 210{
211 char *message = "FIRSTMOUNT=Done";
212 char *envp[] = { message, NULL };
205 struct gdlm_ls *ls = lockspace; 213 struct gdlm_ls *ls = lockspace;
206 ls->first_done = 1; 214 ls->first_done = 1;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE); 215 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
208} 216}
209 217
210/* Userspace gets the offline uevent, blocks new gfs locks on 218/* Userspace gets the offline uevent, blocks new gfs locks on
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4ec571c3d8a9..9b7edcf7bd49 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
195 kobject_put(&ls->kobj); 195 kobject_put(&ls->kobj);
196} 196}
197 197
198static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
199 struct kobj_uevent_env *env)
200{
201 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
202 add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
203 add_uevent_var(env, "LOCKPROTO=lock_dlm");
204 return 0;
205}
206
207static struct kset_uevent_ops gdlm_uevent_ops = {
208 .uevent = gdlm_uevent,
209};
210
211
198int gdlm_sysfs_init(void) 212int gdlm_sysfs_init(void)
199{ 213{
200 gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); 214 gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
201 if (!gdlm_kset) { 215 if (!gdlm_kset) {
202 printk(KERN_WARNING "%s: can not create kset\n", __func__); 216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
203 return -ENOMEM; 217 return -ENOMEM;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bb2cc303ac29..7cacfde32194 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -19,7 +19,7 @@
19 19
20#include "gfs2.h" 20#include "gfs2.h"
21#include "incore.h" 21#include "incore.h"
22#include "ops_fstype.h" 22#include "super.h"
23#include "sys.h" 23#include "sys.h"
24#include "util.h" 24#include "util.h"
25#include "glock.h" 25#include "glock.h"
@@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo)
30 30
31 inode_init_once(&ip->i_inode); 31 inode_init_once(&ip->i_inode);
32 init_rwsem(&ip->i_rw_mutex); 32 init_rwsem(&ip->i_rw_mutex);
33 INIT_LIST_HEAD(&ip->i_trunc_list);
33 ip->i_alloc = NULL; 34 ip->i_alloc = NULL;
34} 35}
35 36
@@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo)
42 INIT_LIST_HEAD(&gl->gl_holders); 43 INIT_LIST_HEAD(&gl->gl_holders);
43 gl->gl_lvb = NULL; 44 gl->gl_lvb = NULL;
44 atomic_set(&gl->gl_lvb_count, 0); 45 atomic_set(&gl->gl_lvb_count, 0);
45 INIT_LIST_HEAD(&gl->gl_reclaim); 46 INIT_LIST_HEAD(&gl->gl_lru);
46 INIT_LIST_HEAD(&gl->gl_ail_list); 47 INIT_LIST_HEAD(&gl->gl_ail_list);
47 atomic_set(&gl->gl_ail_count, 0); 48 atomic_set(&gl->gl_ail_count, 0);
48} 49}
@@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void)
93 if (!gfs2_rgrpd_cachep) 94 if (!gfs2_rgrpd_cachep)
94 goto fail; 95 goto fail;
95 96
97 gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad",
98 sizeof(struct gfs2_quota_data),
99 0, 0, NULL);
100 if (!gfs2_quotad_cachep)
101 goto fail;
102
96 error = register_filesystem(&gfs2_fs_type); 103 error = register_filesystem(&gfs2_fs_type);
97 if (error) 104 if (error)
98 goto fail; 105 goto fail;
@@ -112,6 +119,9 @@ fail_unregister:
112fail: 119fail:
113 gfs2_glock_exit(); 120 gfs2_glock_exit();
114 121
122 if (gfs2_quotad_cachep)
123 kmem_cache_destroy(gfs2_quotad_cachep);
124
115 if (gfs2_rgrpd_cachep) 125 if (gfs2_rgrpd_cachep)
116 kmem_cache_destroy(gfs2_rgrpd_cachep); 126 kmem_cache_destroy(gfs2_rgrpd_cachep);
117 127
@@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void)
140 unregister_filesystem(&gfs2_fs_type); 150 unregister_filesystem(&gfs2_fs_type);
141 unregister_filesystem(&gfs2meta_fs_type); 151 unregister_filesystem(&gfs2meta_fs_type);
142 152
153 kmem_cache_destroy(gfs2_quotad_cachep);
143 kmem_cache_destroy(gfs2_rgrpd_cachep); 154 kmem_cache_destroy(gfs2_rgrpd_cachep);
144 kmem_cache_destroy(gfs2_bufdata_cachep); 155 kmem_cache_destroy(gfs2_bufdata_cachep);
145 kmem_cache_destroy(gfs2_inode_cachep); 156 kmem_cache_destroy(gfs2_inode_cachep);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f96eb90a2cfa..3cb0a44ba023 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -32,7 +32,6 @@ enum {
32 Opt_debug, 32 Opt_debug,
33 Opt_nodebug, 33 Opt_nodebug,
34 Opt_upgrade, 34 Opt_upgrade,
35 Opt_num_glockd,
36 Opt_acl, 35 Opt_acl,
37 Opt_noacl, 36 Opt_noacl,
38 Opt_quota_off, 37 Opt_quota_off,
@@ -57,7 +56,6 @@ static const match_table_t tokens = {
57 {Opt_debug, "debug"}, 56 {Opt_debug, "debug"},
58 {Opt_nodebug, "nodebug"}, 57 {Opt_nodebug, "nodebug"},
59 {Opt_upgrade, "upgrade"}, 58 {Opt_upgrade, "upgrade"},
60 {Opt_num_glockd, "num_glockd=%d"},
61 {Opt_acl, "acl"}, 59 {Opt_acl, "acl"},
62 {Opt_noacl, "noacl"}, 60 {Opt_noacl, "noacl"},
63 {Opt_quota_off, "quota=off"}, 61 {Opt_quota_off, "quota=off"},
@@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
87 int error = 0; 85 int error = 0;
88 86
89 if (!remount) { 87 if (!remount) {
90 /* If someone preloaded options, use those instead */
91 spin_lock(&gfs2_sys_margs_lock);
92 if (gfs2_sys_margs) {
93 data = gfs2_sys_margs;
94 gfs2_sys_margs = NULL;
95 }
96 spin_unlock(&gfs2_sys_margs_lock);
97
98 /* Set some defaults */ 88 /* Set some defaults */
99 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
100 args->ar_quota = GFS2_QUOTA_DEFAULT; 89 args->ar_quota = GFS2_QUOTA_DEFAULT;
101 args->ar_data = GFS2_DATA_DEFAULT; 90 args->ar_data = GFS2_DATA_DEFAULT;
102 } 91 }
@@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
105 process them */ 94 process them */
106 95
107 for (options = data; (o = strsep(&options, ",")); ) { 96 for (options = data; (o = strsep(&options, ",")); ) {
108 int token, option; 97 int token;
109 substring_t tmp[MAX_OPT_ARGS]; 98 substring_t tmp[MAX_OPT_ARGS];
110 99
111 if (!*o) 100 if (!*o)
@@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
196 goto cant_remount; 185 goto cant_remount;
197 args->ar_upgrade = 1; 186 args->ar_upgrade = 1;
198 break; 187 break;
199 case Opt_num_glockd:
200 if ((error = match_int(&tmp[0], &option))) {
201 fs_info(sdp, "problem getting num_glockd\n");
202 goto out_error;
203 }
204
205 if (remount && option != args->ar_num_glockd)
206 goto cant_remount;
207 if (!option || option > GFS2_GLOCKD_MAX) {
208 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
209 GFS2_GLOCKD_MAX, option);
210 error = -EINVAL;
211 goto out_error;
212 }
213 args->ar_num_glockd = option;
214 break;
215 case Opt_acl: 188 case Opt_acl:
216 args->ar_posix_acl = 1; 189 args->ar_posix_acl = 1;
217 sdp->sd_vfs->s_flags |= MS_POSIXACL; 190 sdp->sd_vfs->s_flags |= MS_POSIXACL;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 27563816e1c5..4ddab67867eb 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
210{ 210{
211 struct inode *inode = page->mapping->host; 211 struct inode *inode = page->mapping->host;
212 struct gfs2_sbd *sdp = GFS2_SB(inode); 212 struct gfs2_sbd *sdp = GFS2_SB(inode);
213 int error; 213 int ret;
214 int done_trans = 0; 214 int done_trans = 0;
215 215
216 error = gfs2_writepage_common(page, wbc);
217 if (error <= 0)
218 return error;
219
220 if (PageChecked(page)) { 216 if (PageChecked(page)) {
221 if (wbc->sync_mode != WB_SYNC_ALL) 217 if (wbc->sync_mode != WB_SYNC_ALL)
222 goto out_ignore; 218 goto out_ignore;
223 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); 219 ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
224 if (error) 220 if (ret)
225 goto out_ignore; 221 goto out_ignore;
226 done_trans = 1; 222 done_trans = 1;
227 } 223 }
228 error = __gfs2_jdata_writepage(page, wbc); 224 ret = gfs2_writepage_common(page, wbc);
225 if (ret > 0)
226 ret = __gfs2_jdata_writepage(page, wbc);
229 if (done_trans) 227 if (done_trans)
230 gfs2_trans_end(sdp); 228 gfs2_trans_end(sdp);
231 return error; 229 return ret;
232 230
233out_ignore: 231out_ignore:
234 redirty_page_for_writepage(wbc, page); 232 redirty_page_for_writepage(wbc, page);
@@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
453 451
454 kaddr = kmap_atomic(page, KM_USER0); 452 kaddr = kmap_atomic(page, KM_USER0);
455 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 453 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
456 ip->i_di.di_size); 454 ip->i_disksize);
457 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); 455 memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
458 kunmap_atomic(kaddr, KM_USER0); 456 kunmap_atomic(kaddr, KM_USER0);
459 flush_dcache_page(page); 457 flush_dcache_page(page);
460 brelse(dibh); 458 brelse(dibh);
@@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
627{ 625{
628 struct gfs2_inode *ip = GFS2_I(mapping->host); 626 struct gfs2_inode *ip = GFS2_I(mapping->host);
629 struct gfs2_sbd *sdp = GFS2_SB(mapping->host); 627 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
630 unsigned int data_blocks, ind_blocks, rblocks; 628 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
631 int alloc_required; 629 int alloc_required;
632 int error = 0; 630 int error = 0;
633 struct gfs2_alloc *al; 631 struct gfs2_alloc *al;
@@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
641 if (unlikely(error)) 639 if (unlikely(error))
642 goto out_uninit; 640 goto out_uninit;
643 641
644 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
645 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 642 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
646 if (error) 643 if (error)
647 goto out_unlock; 644 goto out_unlock;
648 645
646 if (alloc_required || gfs2_is_jdata(ip))
647 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
648
649 if (alloc_required) { 649 if (alloc_required) {
650 al = gfs2_alloc_get(ip); 650 al = gfs2_alloc_get(ip);
651 if (!al) { 651 if (!al) {
@@ -675,7 +675,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
675 goto out_trans_fail; 675 goto out_trans_fail;
676 676
677 error = -ENOMEM; 677 error = -ENOMEM;
678 page = __grab_cache_page(mapping, index); 678 flags |= AOP_FLAG_NOFS;
679 page = grab_cache_page_write_begin(mapping, index, flags);
679 *pagep = page; 680 *pagep = page;
680 if (unlikely(!page)) 681 if (unlikely(!page))
681 goto out_endtrans; 682 goto out_endtrans;
@@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
782 783
783 if (inode->i_size < to) { 784 if (inode->i_size < to) {
784 i_size_write(inode, to); 785 i_size_write(inode, to);
785 ip->i_di.di_size = inode->i_size; 786 ip->i_disksize = inode->i_size;
786 di->di_size = cpu_to_be64(inode->i_size); 787 di->di_size = cpu_to_be64(inode->i_size);
787 mark_inode_dirty(inode); 788 mark_inode_dirty(inode);
788 } 789 }
@@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
847 848
848 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 849 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
849 850
850 if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { 851 if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
851 di = (struct gfs2_dinode *)dibh->b_data; 852 di = (struct gfs2_dinode *)dibh->b_data;
852 ip->i_di.di_size = inode->i_size; 853 ip->i_disksize = inode->i_size;
853 di->di_size = cpu_to_be64(inode->i_size); 854 di->di_size = cpu_to_be64(inode->i_size);
854 mark_inode_dirty(inode); 855 mark_inode_dirty(inode);
855 } 856 }
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 4a5e676b4420..c2ad36330ca3 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -19,7 +19,7 @@
19#include "incore.h" 19#include "incore.h"
20#include "dir.h" 20#include "dir.h"
21#include "glock.h" 21#include "glock.h"
22#include "ops_dentry.h" 22#include "super.h"
23#include "util.h" 23#include "util.h"
24#include "inode.h" 24#include "inode.h"
25 25
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
deleted file mode 100644
index 5caa3db4d3f5..000000000000
--- a/fs/gfs2/ops_dentry.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13#include <linux/dcache.h>
14
15extern struct dentry_operations gfs2_dops;
16
17#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index bbb8c36403a9..7fdeb14ddd1a 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,8 +22,7 @@
22#include "glock.h" 22#include "glock.h"
23#include "glops.h" 23#include "glops.h"
24#include "inode.h" 24#include "inode.h"
25#include "ops_dentry.h" 25#include "super.h"
26#include "ops_fstype.h"
27#include "rgrp.h" 26#include "rgrp.h"
28#include "util.h" 27#include "util.h"
29 28
@@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
214 } 213 }
215 214
216 error = -EIO; 215 error = -EIO;
217 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { 216 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
218 iput(inode); 217 iput(inode);
219 goto fail; 218 goto fail;
220 } 219 }
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3a747f8e2188..93fe41b67f97 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -39,7 +39,6 @@
39#include "util.h" 39#include "util.h"
40#include "eaops.h" 40#include "eaops.h"
41#include "ops_address.h" 41#include "ops_address.h"
42#include "ops_inode.h"
43 42
44/** 43/**
45 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
158 if (error) 157 if (error)
159 return error; 158 return error;
160 159
161 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); 160 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
162 if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) 161 if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
163 fsflags |= FS_JOURNAL_DATA_FL; 162 fsflags |= FS_JOURNAL_DATA_FL;
164 if (put_user(fsflags, ptr)) 163 if (put_user(fsflags, ptr))
165 error = -EFAULT; 164 error = -EFAULT;
@@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
172void gfs2_set_inode_flags(struct inode *inode) 171void gfs2_set_inode_flags(struct inode *inode)
173{ 172{
174 struct gfs2_inode *ip = GFS2_I(inode); 173 struct gfs2_inode *ip = GFS2_I(inode);
175 struct gfs2_dinode_host *di = &ip->i_di;
176 unsigned int flags = inode->i_flags; 174 unsigned int flags = inode->i_flags;
177 175
178 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 176 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
179 if (di->di_flags & GFS2_DIF_IMMUTABLE) 177 if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
180 flags |= S_IMMUTABLE; 178 flags |= S_IMMUTABLE;
181 if (di->di_flags & GFS2_DIF_APPENDONLY) 179 if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
182 flags |= S_APPEND; 180 flags |= S_APPEND;
183 if (di->di_flags & GFS2_DIF_NOATIME) 181 if (ip->i_diskflags & GFS2_DIF_NOATIME)
184 flags |= S_NOATIME; 182 flags |= S_NOATIME;
185 if (di->di_flags & GFS2_DIF_SYNC) 183 if (ip->i_diskflags & GFS2_DIF_SYNC)
186 flags |= S_SYNC; 184 flags |= S_SYNC;
187 inode->i_flags = flags; 185 inode->i_flags = flags;
188} 186}
@@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
221 if (error) 219 if (error)
222 goto out_drop_write; 220 goto out_drop_write;
223 221
224 flags = ip->i_di.di_flags; 222 flags = ip->i_diskflags;
225 new_flags = (flags & ~mask) | (reqflags & mask); 223 new_flags = (flags & ~mask) | (reqflags & mask);
226 if ((new_flags ^ flags) == 0) 224 if ((new_flags ^ flags) == 0)
227 goto out; 225 goto out;
@@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
260 if (error) 258 if (error)
261 goto out_trans_end; 259 goto out_trans_end;
262 gfs2_trans_add_bh(ip->i_gl, bh, 1); 260 gfs2_trans_add_bh(ip->i_gl, bh, 1);
263 ip->i_di.di_flags = new_flags; 261 ip->i_diskflags = new_flags;
264 gfs2_dinode_out(ip, bh->b_data); 262 gfs2_dinode_out(ip, bh->b_data);
265 brelse(bh); 263 brelse(bh);
266 gfs2_set_inode_flags(inode); 264 gfs2_set_inode_flags(inode);
@@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
344 struct gfs2_inode *ip = GFS2_I(inode); 342 struct gfs2_inode *ip = GFS2_I(inode);
345 struct gfs2_sbd *sdp = GFS2_SB(inode); 343 struct gfs2_sbd *sdp = GFS2_SB(inode);
346 unsigned long last_index; 344 unsigned long last_index;
347 u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); 345 u64 pos = page->index << PAGE_CACHE_SHIFT;
348 unsigned int data_blocks, ind_blocks, rblocks; 346 unsigned int data_blocks, ind_blocks, rblocks;
349 int alloc_required = 0; 347 int alloc_required = 0;
350 struct gfs2_holder gh; 348 struct gfs2_holder gh;
@@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
357 goto out; 355 goto out;
358 356
359 set_bit(GIF_SW_PAGED, &ip->i_flags); 357 set_bit(GIF_SW_PAGED, &ip->i_flags);
360 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
361 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); 358 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
362 if (ret || !alloc_required) 359 if (ret || !alloc_required)
363 goto out_unlock; 360 goto out_unlock;
@@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
369 ret = gfs2_quota_lock_check(ip); 366 ret = gfs2_quota_lock_check(ip);
370 if (ret) 367 if (ret)
371 goto out_alloc_put; 368 goto out_alloc_put;
369 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
372 al->al_requested = data_blocks + ind_blocks; 370 al->al_requested = data_blocks + ind_blocks;
373 ret = gfs2_inplace_reserve(ip); 371 ret = gfs2_inplace_reserve(ip);
374 if (ret) 372 if (ret)
@@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
479 goto fail; 477 goto fail;
480 478
481 if (!(file->f_flags & O_LARGEFILE) && 479 if (!(file->f_flags & O_LARGEFILE) &&
482 ip->i_di.di_size > MAX_NON_LFS) { 480 ip->i_disksize > MAX_NON_LFS) {
483 error = -EOVERFLOW; 481 error = -EOVERFLOW;
484 goto fail_gunlock; 482 goto fail_gunlock;
485 } 483 }
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b117fcf2c4f5..f91eebdde581 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -22,20 +22,18 @@
22#include "gfs2.h" 22#include "gfs2.h"
23#include "incore.h" 23#include "incore.h"
24#include "bmap.h" 24#include "bmap.h"
25#include "daemon.h"
26#include "glock.h" 25#include "glock.h"
27#include "glops.h" 26#include "glops.h"
28#include "inode.h" 27#include "inode.h"
29#include "mount.h" 28#include "mount.h"
30#include "ops_fstype.h"
31#include "ops_dentry.h"
32#include "ops_super.h"
33#include "recovery.h" 29#include "recovery.h"
34#include "rgrp.h" 30#include "rgrp.h"
35#include "super.h" 31#include "super.h"
36#include "sys.h" 32#include "sys.h"
37#include "util.h" 33#include "util.h"
38#include "log.h" 34#include "log.h"
35#include "quota.h"
36#include "dir.h"
39 37
40#define DO 0 38#define DO 0
41#define UNDO 1 39#define UNDO 1
@@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
58{ 56{
59 spin_lock_init(&gt->gt_spin); 57 spin_lock_init(&gt->gt_spin);
60 58
61 gt->gt_demote_secs = 300;
62 gt->gt_incore_log_blocks = 1024; 59 gt->gt_incore_log_blocks = 1024;
63 gt->gt_log_flush_secs = 60; 60 gt->gt_log_flush_secs = 60;
64 gt->gt_recoverd_secs = 60; 61 gt->gt_recoverd_secs = 60;
65 gt->gt_logd_secs = 1; 62 gt->gt_logd_secs = 1;
66 gt->gt_quotad_secs = 5;
67 gt->gt_quota_simul_sync = 64; 63 gt->gt_quota_simul_sync = 64;
68 gt->gt_quota_warn_period = 10; 64 gt->gt_quota_warn_period = 10;
69 gt->gt_quota_scale_num = 1; 65 gt->gt_quota_scale_num = 1;
@@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
91 87
92 gfs2_tune_init(&sdp->sd_tune); 88 gfs2_tune_init(&sdp->sd_tune);
93 89
94 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
95 spin_lock_init(&sdp->sd_reclaim_lock);
96 init_waitqueue_head(&sdp->sd_reclaim_wq);
97
98 mutex_init(&sdp->sd_inum_mutex); 90 mutex_init(&sdp->sd_inum_mutex);
99 spin_lock_init(&sdp->sd_statfs_spin); 91 spin_lock_init(&sdp->sd_statfs_spin);
100 92
@@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
110 INIT_LIST_HEAD(&sdp->sd_quota_list); 102 INIT_LIST_HEAD(&sdp->sd_quota_list);
111 spin_lock_init(&sdp->sd_quota_spin); 103 spin_lock_init(&sdp->sd_quota_spin);
112 mutex_init(&sdp->sd_quota_mutex); 104 mutex_init(&sdp->sd_quota_mutex);
105 init_waitqueue_head(&sdp->sd_quota_wait);
106 INIT_LIST_HEAD(&sdp->sd_trunc_list);
107 spin_lock_init(&sdp->sd_trunc_lock);
113 108
114 spin_lock_init(&sdp->sd_log_lock); 109 spin_lock_init(&sdp->sd_log_lock);
115 110
@@ -443,24 +438,11 @@ out:
443static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, 438static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
444 int undo) 439 int undo)
445{ 440{
446 struct task_struct *p;
447 int error = 0; 441 int error = 0;
448 442
449 if (undo) 443 if (undo)
450 goto fail_trans; 444 goto fail_trans;
451 445
452 for (sdp->sd_glockd_num = 0;
453 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
454 sdp->sd_glockd_num++) {
455 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
456 error = IS_ERR(p);
457 if (error) {
458 fs_err(sdp, "can't start glockd thread: %d\n", error);
459 goto fail;
460 }
461 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
462 }
463
464 error = gfs2_glock_nq_num(sdp, 446 error = gfs2_glock_nq_num(sdp,
465 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, 447 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
466 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, 448 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
@@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
493 fs_err(sdp, "can't create transaction glock: %d\n", error); 475 fs_err(sdp, "can't create transaction glock: %d\n", error);
494 goto fail_rename; 476 goto fail_rename;
495 } 477 }
496 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
497 478
498 return 0; 479 return 0;
499 480
@@ -506,9 +487,6 @@ fail_live:
506fail_mount: 487fail_mount:
507 gfs2_glock_dq_uninit(mount_gh); 488 gfs2_glock_dq_uninit(mount_gh);
508fail: 489fail:
509 while (sdp->sd_glockd_num--)
510 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
511
512 return error; 490 return error;
513} 491}
514 492
@@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
620 598
621 prev_db = 0; 599 prev_db = 0;
622 600
623 for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) { 601 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
624 bh.b_state = 0; 602 bh.b_state = 0;
625 bh.b_blocknr = 0; 603 bh.b_blocknr = 0;
626 bh.b_size = 1 << ip->i_inode.i_blkbits; 604 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
661 sdp->sd_lockstruct.ls_lockspace); 639 sdp->sd_lockstruct.ls_lockspace);
662} 640}
663 641
642/**
643 * gfs2_jindex_hold - Grab a lock on the jindex
644 * @sdp: The GFS2 superblock
645 * @ji_gh: the holder for the jindex glock
646 *
647 * Returns: errno
648 */
649
650static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
651{
652 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
653 struct qstr name;
654 char buf[20];
655 struct gfs2_jdesc *jd;
656 int error;
657
658 name.name = buf;
659
660 mutex_lock(&sdp->sd_jindex_mutex);
661
662 for (;;) {
663 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
664 if (error)
665 break;
666
667 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
668 name.hash = gfs2_disk_hash(name.name, name.len);
669
670 error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
671 if (error == -ENOENT) {
672 error = 0;
673 break;
674 }
675
676 gfs2_glock_dq_uninit(ji_gh);
677
678 if (error)
679 break;
680
681 error = -ENOMEM;
682 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
683 if (!jd)
684 break;
685
686 INIT_LIST_HEAD(&jd->extent_list);
687 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
688 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
689 if (!jd->jd_inode)
690 error = -ENOENT;
691 else
692 error = PTR_ERR(jd->jd_inode);
693 kfree(jd);
694 break;
695 }
696
697 spin_lock(&sdp->sd_jindex_spin);
698 jd->jd_jid = sdp->sd_journals++;
699 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
700 spin_unlock(&sdp->sd_jindex_spin);
701 }
702
703 mutex_unlock(&sdp->sd_jindex_mutex);
704
705 return error;
706}
707
664static int init_journal(struct gfs2_sbd *sdp, int undo) 708static int init_journal(struct gfs2_sbd *sdp, int undo)
665{ 709{
666 struct inode *master = sdp->sd_master_dir->d_inode; 710 struct inode *master = sdp->sd_master_dir->d_inode;
@@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
681 return PTR_ERR(sdp->sd_jindex); 725 return PTR_ERR(sdp->sd_jindex);
682 } 726 }
683 ip = GFS2_I(sdp->sd_jindex); 727 ip = GFS2_I(sdp->sd_jindex);
684 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
685 728
686 /* Load in the journal index special file */ 729 /* Load in the journal index special file */
687 730
@@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
832 goto fail_statfs; 875 goto fail_statfs;
833 } 876 }
834 ip = GFS2_I(sdp->sd_rindex); 877 ip = GFS2_I(sdp->sd_rindex);
835 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
836 sdp->sd_rindex_uptodate = 0; 878 sdp->sd_rindex_uptodate = 0;
837 879
838 /* Read in the quota inode */ 880 /* Read in the quota inode */
@@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
973 } 1015 }
974 sdp->sd_logd_process = p; 1016 sdp->sd_logd_process = p;
975 1017
976 sdp->sd_statfs_sync_time = jiffies;
977 sdp->sd_quota_sync_time = jiffies;
978
979 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); 1018 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
980 error = IS_ERR(p); 1019 error = IS_ERR(p);
981 if (error) { 1020 if (error) {
@@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1224static void gfs2_kill_sb(struct super_block *sb) 1263static void gfs2_kill_sb(struct super_block *sb)
1225{ 1264{
1226 struct gfs2_sbd *sdp = sb->s_fs_info; 1265 struct gfs2_sbd *sdp = sb->s_fs_info;
1227 if (sdp) { 1266
1228 gfs2_meta_syncfs(sdp); 1267 if (sdp == NULL) {
1229 dput(sdp->sd_root_dir); 1268 kill_block_super(sb);
1230 dput(sdp->sd_master_dir); 1269 return;
1231 sdp->sd_root_dir = NULL;
1232 sdp->sd_master_dir = NULL;
1233 } 1270 }
1271
1272 gfs2_meta_syncfs(sdp);
1273 dput(sdp->sd_root_dir);
1274 dput(sdp->sd_master_dir);
1275 sdp->sd_root_dir = NULL;
1276 sdp->sd_master_dir = NULL;
1234 shrink_dcache_sb(sb); 1277 shrink_dcache_sb(sb);
1235 kill_block_super(sb); 1278 kill_block_super(sb);
1236 if (sdp) 1279 gfs2_delete_debugfs_file(sdp);
1237 gfs2_delete_debugfs_file(sdp); 1280 kfree(sdp);
1238} 1281}
1239 1282
1240struct file_system_type gfs2_fs_type = { 1283struct file_system_type gfs2_fs_type = {
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
deleted file mode 100644
index da8490511836..000000000000
--- a/fs/gfs2/ops_fstype.h
+++ /dev/null
@@ -1,19 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct file_system_type gfs2_fs_type;
16extern struct file_system_type gfs2meta_fs_type;
17extern const struct export_operations gfs2_export_ops;
18
19#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d232991b9046..49877546beb9 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -19,6 +19,7 @@
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/crc32.h> 20#include <linux/crc32.h>
21#include <linux/lm_interface.h> 21#include <linux/lm_interface.h>
22#include <linux/fiemap.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
24#include "gfs2.h" 25#include "gfs2.h"
@@ -31,12 +32,11 @@
31#include "glock.h" 32#include "glock.h"
32#include "inode.h" 33#include "inode.h"
33#include "meta_io.h" 34#include "meta_io.h"
34#include "ops_dentry.h"
35#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
38#include "trans.h" 37#include "trans.h"
39#include "util.h" 38#include "util.h"
39#include "super.h"
40 40
41/** 41/**
42 * gfs2_create - Create a file 42 * gfs2_create - Create a file
@@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
185 if (!dip->i_inode.i_nlink) 185 if (!dip->i_inode.i_nlink)
186 goto out_gunlock; 186 goto out_gunlock;
187 error = -EFBIG; 187 error = -EFBIG;
188 if (dip->i_di.di_entries == (u32)-1) 188 if (dip->i_entries == (u32)-1)
189 goto out_gunlock; 189 goto out_gunlock;
190 error = -EPERM; 190 error = -EPERM;
191 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 191 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
371 371
372 ip = ghs[1].gh_gl->gl_object; 372 ip = ghs[1].gh_gl->gl_object;
373 373
374 ip->i_di.di_size = size; 374 ip->i_disksize = size;
375 375
376 error = gfs2_meta_inode_buffer(ip, &dibh); 376 error = gfs2_meta_inode_buffer(ip, &dibh);
377 377
@@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
425 ip = ghs[1].gh_gl->gl_object; 425 ip = ghs[1].gh_gl->gl_object;
426 426
427 ip->i_inode.i_nlink = 2; 427 ip->i_inode.i_nlink = 2;
428 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 428 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
429 ip->i_di.di_flags |= GFS2_DIF_JDATA; 429 ip->i_diskflags |= GFS2_DIF_JDATA;
430 ip->i_di.di_entries = 2; 430 ip->i_entries = 2;
431 431
432 error = gfs2_meta_inode_buffer(ip, &dibh); 432 error = gfs2_meta_inode_buffer(ip, &dibh);
433 433
@@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
517 if (error) 517 if (error)
518 goto out_gunlock; 518 goto out_gunlock;
519 519
520 if (ip->i_di.di_entries < 2) { 520 if (ip->i_entries < 2) {
521 if (gfs2_consist_inode(ip)) 521 if (gfs2_consist_inode(ip))
522 gfs2_dinode_print(ip); 522 gfs2_dinode_print(ip);
523 error = -EIO; 523 error = -EIO;
524 goto out_gunlock; 524 goto out_gunlock;
525 } 525 }
526 if (ip->i_di.di_entries > 2) { 526 if (ip->i_entries > 2) {
527 error = -ENOTEMPTY; 527 error = -ENOTEMPTY;
528 goto out_gunlock; 528 goto out_gunlock;
529 } 529 }
@@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
726 goto out_gunlock; 726 goto out_gunlock;
727 727
728 if (S_ISDIR(nip->i_inode.i_mode)) { 728 if (S_ISDIR(nip->i_inode.i_mode)) {
729 if (nip->i_di.di_entries < 2) { 729 if (nip->i_entries < 2) {
730 if (gfs2_consist_inode(nip)) 730 if (gfs2_consist_inode(nip))
731 gfs2_dinode_print(nip); 731 gfs2_dinode_print(nip);
732 error = -EIO; 732 error = -EIO;
733 goto out_gunlock; 733 goto out_gunlock;
734 } 734 }
735 if (nip->i_di.di_entries > 2) { 735 if (nip->i_entries > 2) {
736 error = -ENOTEMPTY; 736 error = -ENOTEMPTY;
737 goto out_gunlock; 737 goto out_gunlock;
738 } 738 }
@@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 error = -EINVAL; 758 error = -EINVAL;
759 goto out_gunlock; 759 goto out_gunlock;
760 } 760 }
761 if (ndip->i_di.di_entries == (u32)-1) { 761 if (ndip->i_entries == (u32)-1) {
762 error = -EFBIG; 762 error = -EFBIG;
763 goto out_gunlock; 763 goto out_gunlock;
764 } 764 }
@@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
990 struct gfs2_sbd *sdp = GFS2_SB(inode); 990 struct gfs2_sbd *sdp = GFS2_SB(inode);
991 int error; 991 int error;
992 992
993 if (attr->ia_size != ip->i_di.di_size) { 993 if (attr->ia_size != ip->i_disksize) {
994 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 994 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
995 if (error) 995 if (error)
996 return error; 996 return error;
@@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1001 } 1001 }
1002 1002
1003 error = gfs2_truncatei(ip, attr->ia_size); 1003 error = gfs2_truncatei(ip, attr->ia_size);
1004 if (error && (inode->i_size != ip->i_di.di_size)) 1004 if (error && (inode->i_size != ip->i_disksize))
1005 i_size_write(inode, ip->i_di.di_size); 1005 i_size_write(inode, ip->i_disksize);
1006 1006
1007 return error; 1007 return error;
1008} 1008}
@@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1212 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); 1212 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1213} 1213}
1214 1214
1215static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1216 u64 start, u64 len)
1217{
1218 struct gfs2_inode *ip = GFS2_I(inode);
1219 struct gfs2_holder gh;
1220 int ret;
1221
1222 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
1223 if (ret)
1224 return ret;
1225
1226 mutex_lock(&inode->i_mutex);
1227
1228 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
1229 if (ret)
1230 goto out;
1231
1232 if (gfs2_is_stuffed(ip)) {
1233 u64 phys = ip->i_no_addr << inode->i_blkbits;
1234 u64 size = i_size_read(inode);
1235 u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
1236 FIEMAP_EXTENT_DATA_INLINE;
1237 phys += sizeof(struct gfs2_dinode);
1238 phys += start;
1239 if (start + len > size)
1240 len = size - start;
1241 if (start < size)
1242 ret = fiemap_fill_next_extent(fieinfo, start, phys,
1243 len, flags);
1244 if (ret == 1)
1245 ret = 0;
1246 } else {
1247 ret = __generic_block_fiemap(inode, fieinfo, start, len,
1248 gfs2_block_map);
1249 }
1250
1251 gfs2_glock_dq_uninit(&gh);
1252out:
1253 mutex_unlock(&inode->i_mutex);
1254 return ret;
1255}
1256
1215const struct inode_operations gfs2_file_iops = { 1257const struct inode_operations gfs2_file_iops = {
1216 .permission = gfs2_permission, 1258 .permission = gfs2_permission,
1217 .setattr = gfs2_setattr, 1259 .setattr = gfs2_setattr,
@@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = {
1220 .getxattr = gfs2_getxattr, 1262 .getxattr = gfs2_getxattr,
1221 .listxattr = gfs2_listxattr, 1263 .listxattr = gfs2_listxattr,
1222 .removexattr = gfs2_removexattr, 1264 .removexattr = gfs2_removexattr,
1265 .fiemap = gfs2_fiemap,
1223}; 1266};
1224 1267
1225const struct inode_operations gfs2_dir_iops = { 1268const struct inode_operations gfs2_dir_iops = {
@@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = {
1239 .getxattr = gfs2_getxattr, 1282 .getxattr = gfs2_getxattr,
1240 .listxattr = gfs2_listxattr, 1283 .listxattr = gfs2_listxattr,
1241 .removexattr = gfs2_removexattr, 1284 .removexattr = gfs2_removexattr,
1285 .fiemap = gfs2_fiemap,
1242}; 1286};
1243 1287
1244const struct inode_operations gfs2_symlink_iops = { 1288const struct inode_operations gfs2_symlink_iops = {
@@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = {
1251 .getxattr = gfs2_getxattr, 1295 .getxattr = gfs2_getxattr,
1252 .listxattr = gfs2_listxattr, 1296 .listxattr = gfs2_listxattr,
1253 .removexattr = gfs2_removexattr, 1297 .removexattr = gfs2_removexattr,
1298 .fiemap = gfs2_fiemap,
1254}; 1299};
1255 1300
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
deleted file mode 100644
index 14b4b797622a..000000000000
--- a/fs/gfs2/ops_inode.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13#include <linux/fs.h>
14
15extern const struct inode_operations gfs2_file_iops;
16extern const struct inode_operations gfs2_dir_iops;
17extern const struct inode_operations gfs2_symlink_iops;
18extern const struct file_operations gfs2_file_fops;
19extern const struct file_operations gfs2_dir_fops;
20extern const struct file_operations gfs2_file_fops_nolock;
21extern const struct file_operations gfs2_dir_fops_nolock;
22
23extern void gfs2_set_inode_flags(struct inode *inode);
24
25#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index d5355d9b5926..320323d03479 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -28,7 +28,6 @@
28#include "inode.h" 28#include "inode.h"
29#include "log.h" 29#include "log.h"
30#include "mount.h" 30#include "mount.h"
31#include "ops_super.h"
32#include "quota.h" 31#include "quota.h"
33#include "recovery.h" 32#include "recovery.h"
34#include "rgrp.h" 33#include "rgrp.h"
@@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb)
143 kthread_stop(sdp->sd_quotad_process); 142 kthread_stop(sdp->sd_quotad_process);
144 kthread_stop(sdp->sd_logd_process); 143 kthread_stop(sdp->sd_logd_process);
145 kthread_stop(sdp->sd_recoverd_process); 144 kthread_stop(sdp->sd_recoverd_process);
146 while (sdp->sd_glockd_num--)
147 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
148 145
149 if (!(sb->s_flags & MS_RDONLY)) { 146 if (!(sb->s_flags & MS_RDONLY)) {
150 error = gfs2_make_fs_ro(sdp); 147 error = gfs2_make_fs_ro(sdp);
@@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb)
185 182
186 /* At this point, we're through participating in the lockspace */ 183 /* At this point, we're through participating in the lockspace */
187 gfs2_sys_fs_del(sdp); 184 gfs2_sys_fs_del(sdp);
188 kfree(sdp);
189} 185}
190 186
191/** 187/**
@@ -215,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
215} 211}
216 212
217/** 213/**
218 * gfs2_write_super_lockfs - prevent further writes to the filesystem 214 * gfs2_freeze - prevent further writes to the filesystem
219 * @sb: the VFS structure for the filesystem 215 * @sb: the VFS structure for the filesystem
220 * 216 *
221 */ 217 */
222 218
223static void gfs2_write_super_lockfs(struct super_block *sb) 219static int gfs2_freeze(struct super_block *sb)
224{ 220{
225 struct gfs2_sbd *sdp = sb->s_fs_info; 221 struct gfs2_sbd *sdp = sb->s_fs_info;
226 int error; 222 int error;
227 223
228 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 224 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
229 return; 225 return -EINVAL;
230 226
231 for (;;) { 227 for (;;) {
232 error = gfs2_freeze_fs(sdp); 228 error = gfs2_freeze_fs(sdp);
@@ -246,17 +242,150 @@ static void gfs2_write_super_lockfs(struct super_block *sb)
246 fs_err(sdp, "retrying...\n"); 242 fs_err(sdp, "retrying...\n");
247 msleep(1000); 243 msleep(1000);
248 } 244 }
245 return 0;
249} 246}
250 247
251/** 248/**
252 * gfs2_unlockfs - reallow writes to the filesystem 249 * gfs2_unfreeze - reallow writes to the filesystem
253 * @sb: the VFS structure for the filesystem 250 * @sb: the VFS structure for the filesystem
254 * 251 *
255 */ 252 */
256 253
257static void gfs2_unlockfs(struct super_block *sb) 254static int gfs2_unfreeze(struct super_block *sb)
258{ 255{
259 gfs2_unfreeze_fs(sb->s_fs_info); 256 gfs2_unfreeze_fs(sb->s_fs_info);
257 return 0;
258}
259
260/**
261 * statfs_fill - fill in the sg for a given RG
262 * @rgd: the RG
263 * @sc: the sc structure
264 *
265 * Returns: 0 on success, -ESTALE if the LVB is invalid
266 */
267
268static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
269 struct gfs2_statfs_change_host *sc)
270{
271 gfs2_rgrp_verify(rgd);
272 sc->sc_total += rgd->rd_data;
273 sc->sc_free += rgd->rd_free;
274 sc->sc_dinodes += rgd->rd_dinodes;
275 return 0;
276}
277
278/**
279 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
280 * @sdp: the filesystem
281 * @sc: the sc info that will be returned
282 *
283 * Any error (other than a signal) will cause this routine to fall back
284 * to the synchronous version.
285 *
286 * FIXME: This really shouldn't busy wait like this.
287 *
288 * Returns: errno
289 */
290
291static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
292{
293 struct gfs2_holder ri_gh;
294 struct gfs2_rgrpd *rgd_next;
295 struct gfs2_holder *gha, *gh;
296 unsigned int slots = 64;
297 unsigned int x;
298 int done;
299 int error = 0, err;
300
301 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
302 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
303 if (!gha)
304 return -ENOMEM;
305
306 error = gfs2_rindex_hold(sdp, &ri_gh);
307 if (error)
308 goto out;
309
310 rgd_next = gfs2_rgrpd_get_first(sdp);
311
312 for (;;) {
313 done = 1;
314
315 for (x = 0; x < slots; x++) {
316 gh = gha + x;
317
318 if (gh->gh_gl && gfs2_glock_poll(gh)) {
319 err = gfs2_glock_wait(gh);
320 if (err) {
321 gfs2_holder_uninit(gh);
322 error = err;
323 } else {
324 if (!error)
325 error = statfs_slow_fill(
326 gh->gh_gl->gl_object, sc);
327 gfs2_glock_dq_uninit(gh);
328 }
329 }
330
331 if (gh->gh_gl)
332 done = 0;
333 else if (rgd_next && !error) {
334 error = gfs2_glock_nq_init(rgd_next->rd_gl,
335 LM_ST_SHARED,
336 GL_ASYNC,
337 gh);
338 rgd_next = gfs2_rgrpd_get_next(rgd_next);
339 done = 0;
340 }
341
342 if (signal_pending(current))
343 error = -ERESTARTSYS;
344 }
345
346 if (done)
347 break;
348
349 yield();
350 }
351
352 gfs2_glock_dq_uninit(&ri_gh);
353
354out:
355 kfree(gha);
356 return error;
357}
358
359/**
360 * gfs2_statfs_i - Do a statfs
361 * @sdp: the filesystem
362 * @sg: the sg structure
363 *
364 * Returns: errno
365 */
366
367static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
368{
369 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
370 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
371
372 spin_lock(&sdp->sd_statfs_spin);
373
374 *sc = *m_sc;
375 sc->sc_total += l_sc->sc_total;
376 sc->sc_free += l_sc->sc_free;
377 sc->sc_dinodes += l_sc->sc_dinodes;
378
379 spin_unlock(&sdp->sd_statfs_spin);
380
381 if (sc->sc_free < 0)
382 sc->sc_free = 0;
383 if (sc->sc_free > sc->sc_total)
384 sc->sc_free = sc->sc_total;
385 if (sc->sc_dinodes < 0)
386 sc->sc_dinodes = 0;
387
388 return 0;
260} 389}
261 390
262/** 391/**
@@ -370,7 +499,6 @@ static void gfs2_clear_inode(struct inode *inode)
370 */ 499 */
371 if (test_bit(GIF_USER, &ip->i_flags)) { 500 if (test_bit(GIF_USER, &ip->i_flags)) {
372 ip->i_gl->gl_object = NULL; 501 ip->i_gl->gl_object = NULL;
373 gfs2_glock_schedule_for_reclaim(ip->i_gl);
374 gfs2_glock_put(ip->i_gl); 502 gfs2_glock_put(ip->i_gl);
375 ip->i_gl = NULL; 503 ip->i_gl = NULL;
376 if (ip->i_iopen_gh.gh_gl) { 504 if (ip->i_iopen_gh.gh_gl) {
@@ -423,8 +551,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
423 seq_printf(s, ",debug"); 551 seq_printf(s, ",debug");
424 if (args->ar_upgrade) 552 if (args->ar_upgrade)
425 seq_printf(s, ",upgrade"); 553 seq_printf(s, ",upgrade");
426 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
427 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
428 if (args->ar_posix_acl) 554 if (args->ar_posix_acl)
429 seq_printf(s, ",acl"); 555 seq_printf(s, ",acl");
430 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 556 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -494,16 +620,16 @@ static void gfs2_delete_inode(struct inode *inode)
494 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 620 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
495 error = gfs2_glock_nq(&ip->i_iopen_gh); 621 error = gfs2_glock_nq(&ip->i_iopen_gh);
496 if (error) 622 if (error)
497 goto out_uninit; 623 goto out_truncate;
498 624
499 if (S_ISDIR(inode->i_mode) && 625 if (S_ISDIR(inode->i_mode) &&
500 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { 626 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
501 error = gfs2_dir_exhash_dealloc(ip); 627 error = gfs2_dir_exhash_dealloc(ip);
502 if (error) 628 if (error)
503 goto out_unlock; 629 goto out_unlock;
504 } 630 }
505 631
506 if (ip->i_di.di_eattr) { 632 if (ip->i_eattr) {
507 error = gfs2_ea_dealloc(ip); 633 error = gfs2_ea_dealloc(ip);
508 if (error) 634 if (error)
509 goto out_unlock; 635 goto out_unlock;
@@ -519,6 +645,7 @@ static void gfs2_delete_inode(struct inode *inode)
519 if (error) 645 if (error)
520 goto out_unlock; 646 goto out_unlock;
521 647
648out_truncate:
522 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 649 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
523 if (error) 650 if (error)
524 goto out_unlock; 651 goto out_unlock;
@@ -527,8 +654,8 @@ static void gfs2_delete_inode(struct inode *inode)
527 gfs2_trans_end(sdp); 654 gfs2_trans_end(sdp);
528 655
529out_unlock: 656out_unlock:
530 gfs2_glock_dq(&ip->i_iopen_gh); 657 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
531out_uninit: 658 gfs2_glock_dq(&ip->i_iopen_gh);
532 gfs2_holder_uninit(&ip->i_iopen_gh); 659 gfs2_holder_uninit(&ip->i_iopen_gh);
533 gfs2_glock_dq_uninit(&gh); 660 gfs2_glock_dq_uninit(&gh);
534 if (error && error != GLR_TRYFAILED) 661 if (error && error != GLR_TRYFAILED)
@@ -563,8 +690,8 @@ const struct super_operations gfs2_super_ops = {
563 .put_super = gfs2_put_super, 690 .put_super = gfs2_put_super,
564 .write_super = gfs2_write_super, 691 .write_super = gfs2_write_super,
565 .sync_fs = gfs2_sync_fs, 692 .sync_fs = gfs2_sync_fs,
566 .write_super_lockfs = gfs2_write_super_lockfs, 693 .freeze_fs = gfs2_freeze,
567 .unlockfs = gfs2_unlockfs, 694 .unfreeze_fs = gfs2_unfreeze,
568 .statfs = gfs2_statfs, 695 .statfs = gfs2_statfs,
569 .remount_fs = gfs2_remount_fs, 696 .remount_fs = gfs2_remount_fs,
570 .clear_inode = gfs2_clear_inode, 697 .clear_inode = gfs2_clear_inode,
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
deleted file mode 100644
index 442a274c6272..000000000000
--- a/fs/gfs2/ops_super.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13#include <linux/fs.h>
14
15extern const struct super_operations gfs2_super_ops;
16
17#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e073f5144fa..b08d09696b3e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -46,6 +46,8 @@
46#include <linux/bio.h> 46#include <linux/bio.h>
47#include <linux/gfs2_ondisk.h> 47#include <linux/gfs2_ondisk.h>
48#include <linux/lm_interface.h> 48#include <linux/lm_interface.h>
49#include <linux/kthread.h>
50#include <linux/freezer.h>
49 51
50#include "gfs2.h" 52#include "gfs2.h"
51#include "incore.h" 53#include "incore.h"
@@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
94 struct gfs2_quota_data *qd; 96 struct gfs2_quota_data *qd;
95 int error; 97 int error;
96 98
97 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS); 99 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
98 if (!qd) 100 if (!qd)
99 return -ENOMEM; 101 return -ENOMEM;
100 102
@@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
119 return 0; 121 return 0;
120 122
121fail: 123fail:
122 kfree(qd); 124 kmem_cache_free(gfs2_quotad_cachep, qd);
123 return error; 125 return error;
124} 126}
125 127
@@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
158 if (qd || !create) { 160 if (qd || !create) {
159 if (new_qd) { 161 if (new_qd) {
160 gfs2_lvb_unhold(new_qd->qd_gl); 162 gfs2_lvb_unhold(new_qd->qd_gl);
161 kfree(new_qd); 163 kmem_cache_free(gfs2_quotad_cachep, new_qd);
162 } 164 }
163 *qdp = qd; 165 *qdp = qd;
164 return 0; 166 return 0;
@@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1013 1015
1014 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) 1016 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
1015 return; 1017 return;
1016 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) 1018 if (ip->i_diskflags & GFS2_DIF_SYSTEM)
1017 return; 1019 return;
1018 1020
1019 for (x = 0; x < al->al_qd_num; x++) { 1021 for (x = 0; x < al->al_qd_num; x++) {
@@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1100int gfs2_quota_init(struct gfs2_sbd *sdp) 1102int gfs2_quota_init(struct gfs2_sbd *sdp)
1101{ 1103{
1102 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1104 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1103 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; 1105 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
1104 unsigned int x, slot = 0; 1106 unsigned int x, slot = 0;
1105 unsigned int found = 0; 1107 unsigned int found = 0;
1106 u64 dblock; 1108 u64 dblock;
1107 u32 extlen = 0; 1109 u32 extlen = 0;
1108 int error; 1110 int error;
1109 1111
1110 if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) || 1112 if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
1111 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { 1113 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1112 gfs2_consist_inode(ip); 1114 gfs2_consist_inode(ip);
1113 return -EIO; 1115 return -EIO;
1114 } 1116 }
@@ -1195,7 +1197,7 @@ fail:
1195 return error; 1197 return error;
1196} 1198}
1197 1199
1198void gfs2_quota_scan(struct gfs2_sbd *sdp) 1200static void gfs2_quota_scan(struct gfs2_sbd *sdp)
1199{ 1201{
1200 struct gfs2_quota_data *qd, *safe; 1202 struct gfs2_quota_data *qd, *safe;
1201 LIST_HEAD(dead); 1203 LIST_HEAD(dead);
@@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp)
1222 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1224 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1223 1225
1224 gfs2_lvb_unhold(qd->qd_gl); 1226 gfs2_lvb_unhold(qd->qd_gl);
1225 kfree(qd); 1227 kmem_cache_free(gfs2_quotad_cachep, qd);
1226 } 1228 }
1227} 1229}
1228 1230
@@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1257 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1259 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1258 1260
1259 gfs2_lvb_unhold(qd->qd_gl); 1261 gfs2_lvb_unhold(qd->qd_gl);
1260 kfree(qd); 1262 kmem_cache_free(gfs2_quotad_cachep, qd);
1261 1263
1262 spin_lock(&sdp->sd_quota_spin); 1264 spin_lock(&sdp->sd_quota_spin);
1263 } 1265 }
@@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1272 } 1274 }
1273} 1275}
1274 1276
1277static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
1278{
1279 if (error == 0 || error == -EROFS)
1280 return;
1281 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
1282 fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
1283}
1284
1285static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
1286 int (*fxn)(struct gfs2_sbd *sdp),
1287 unsigned long t, unsigned long *timeo,
1288 unsigned int *new_timeo)
1289{
1290 if (t >= *timeo) {
1291 int error = fxn(sdp);
1292 quotad_error(sdp, msg, error);
1293 *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
1294 } else {
1295 *timeo -= t;
1296 }
1297}
1298
1299static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
1300{
1301 struct gfs2_inode *ip;
1302
1303 while(1) {
1304 ip = NULL;
1305 spin_lock(&sdp->sd_trunc_lock);
1306 if (!list_empty(&sdp->sd_trunc_list)) {
1307 ip = list_entry(sdp->sd_trunc_list.next,
1308 struct gfs2_inode, i_trunc_list);
1309 list_del_init(&ip->i_trunc_list);
1310 }
1311 spin_unlock(&sdp->sd_trunc_lock);
1312 if (ip == NULL)
1313 return;
1314 gfs2_glock_finish_truncate(ip);
1315 }
1316}
1317
1318/**
1319 * gfs2_quotad - Write cached quota changes into the quota file
1320 * @sdp: Pointer to GFS2 superblock
1321 *
1322 */
1323
1324int gfs2_quotad(void *data)
1325{
1326 struct gfs2_sbd *sdp = data;
1327 struct gfs2_tune *tune = &sdp->sd_tune;
1328 unsigned long statfs_timeo = 0;
1329 unsigned long quotad_timeo = 0;
1330 unsigned long t = 0;
1331 DEFINE_WAIT(wait);
1332 int empty;
1333
1334 while (!kthread_should_stop()) {
1335
1336 /* Update the master statfs file */
1337 quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
1338 &statfs_timeo, &tune->gt_statfs_quantum);
1339
1340 /* Update quota file */
1341 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
1342 &quotad_timeo, &tune->gt_quota_quantum);
1343
1344 /* FIXME: This should be turned into a shrinker */
1345 gfs2_quota_scan(sdp);
1346
1347 /* Check for & recover partially truncated inodes */
1348 quotad_check_trunc_list(sdp);
1349
1350 if (freezing(current))
1351 refrigerator();
1352 t = min(quotad_timeo, statfs_timeo);
1353
1354 prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
1355 spin_lock(&sdp->sd_trunc_lock);
1356 empty = list_empty(&sdp->sd_trunc_list);
1357 spin_unlock(&sdp->sd_trunc_lock);
1358 if (empty)
1359 t -= schedule_timeout(t);
1360 else
1361 t = 0;
1362 finish_wait(&sdp->sd_quota_wait, &wait);
1363 }
1364
1365 return 0;
1366}
1367
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 3b7f4b0e5dfe..cec9032be97d 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,22 +15,22 @@ struct gfs2_sbd;
15 15
16#define NO_QUOTA_CHANGE ((u32)-1) 16#define NO_QUOTA_CHANGE ((u32)-1)
17 17
18int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); 18extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
19void gfs2_quota_unhold(struct gfs2_inode *ip); 19extern void gfs2_quota_unhold(struct gfs2_inode *ip);
20 20
21int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); 21extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
22void gfs2_quota_unlock(struct gfs2_inode *ip); 22extern void gfs2_quota_unlock(struct gfs2_inode *ip);
23 23
24int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); 24extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid); 26 u32 uid, u32 gid);
27 27
28int gfs2_quota_sync(struct gfs2_sbd *sdp); 28extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
29int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); 29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30 30
31int gfs2_quota_init(struct gfs2_sbd *sdp); 31extern int gfs2_quota_init(struct gfs2_sbd *sdp);
32void gfs2_quota_scan(struct gfs2_sbd *sdp); 32extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
33void gfs2_quota_cleanup(struct gfs2_sbd *sdp); 33extern int gfs2_quotad(void *data);
34 34
35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) 35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
36{ 36{
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index d5e91f4f6a0b..efd09c3d2b26 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,6 +14,8 @@
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/lm_interface.h> 16#include <linux/lm_interface.h>
17#include <linux/kthread.h>
18#include <linux/freezer.h>
17 19
18#include "gfs2.h" 20#include "gfs2.h"
19#include "incore.h" 21#include "incore.h"
@@ -583,13 +585,35 @@ fail:
583 return error; 585 return error;
584} 586}
585 587
588static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
589{
590 struct gfs2_jdesc *jd;
591 int found = 0;
592
593 spin_lock(&sdp->sd_jindex_spin);
594
595 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
596 if (jd->jd_dirty) {
597 jd->jd_dirty = 0;
598 found = 1;
599 break;
600 }
601 }
602 spin_unlock(&sdp->sd_jindex_spin);
603
604 if (!found)
605 jd = NULL;
606
607 return jd;
608}
609
586/** 610/**
587 * gfs2_check_journals - Recover any dirty journals 611 * gfs2_check_journals - Recover any dirty journals
588 * @sdp: the filesystem 612 * @sdp: the filesystem
589 * 613 *
590 */ 614 */
591 615
592void gfs2_check_journals(struct gfs2_sbd *sdp) 616static void gfs2_check_journals(struct gfs2_sbd *sdp)
593{ 617{
594 struct gfs2_jdesc *jd; 618 struct gfs2_jdesc *jd;
595 619
@@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp)
603 } 627 }
604} 628}
605 629
630/**
631 * gfs2_recoverd - Recover dead machine's journals
632 * @sdp: Pointer to GFS2 superblock
633 *
634 */
635
636int gfs2_recoverd(void *data)
637{
638 struct gfs2_sbd *sdp = data;
639 unsigned long t;
640
641 while (!kthread_should_stop()) {
642 gfs2_check_journals(sdp);
643 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
644 if (freezing(current))
645 refrigerator();
646 schedule_timeout_interruptible(t);
647 }
648
649 return 0;
650}
651
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index f7235e61c723..a8218ea15b57 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
18 *blk = 0; 18 *blk = 0;
19} 19}
20 20
21int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 21extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
22 struct buffer_head **bh); 22 struct buffer_head **bh);
23 23
24int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 24extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
25int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 25extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
26void gfs2_revoke_clean(struct gfs2_sbd *sdp); 26extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
27 27
28int gfs2_find_jhead(struct gfs2_jdesc *jd, 28extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header_host *head); 29 struct gfs2_log_header_host *head);
30int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); 30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31void gfs2_check_journals(struct gfs2_sbd *sdp); 31extern int gfs2_recoverd(void *data);
32 32
33#endif /* __RECOVERY_DOT_H__ */ 33#endif /* __RECOVERY_DOT_H__ */
34 34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2d90fb253505..8b01c635d925 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
269 bi->bi_len, x); 269 bi->bi_len, x);
270 } 270 }
271 271
272 if (count[0] != rgd->rd_rg.rg_free) { 272 if (count[0] != rgd->rd_free) {
273 if (gfs2_consist_rgrpd(rgd)) 273 if (gfs2_consist_rgrpd(rgd))
274 fs_err(sdp, "free data mismatch: %u != %u\n", 274 fs_err(sdp, "free data mismatch: %u != %u\n",
275 count[0], rgd->rd_rg.rg_free); 275 count[0], rgd->rd_free);
276 return; 276 return;
277 } 277 }
278 278
279 tmp = rgd->rd_data - 279 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
280 rgd->rd_rg.rg_free -
281 rgd->rd_rg.rg_dinodes;
282 if (count[1] + count[2] != tmp) { 280 if (count[1] + count[2] != tmp) {
283 if (gfs2_consist_rgrpd(rgd)) 281 if (gfs2_consist_rgrpd(rgd))
284 fs_err(sdp, "used data mismatch: %u != %u\n", 282 fs_err(sdp, "used data mismatch: %u != %u\n",
@@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
286 return; 284 return;
287 } 285 }
288 286
289 if (count[3] != rgd->rd_rg.rg_dinodes) { 287 if (count[3] != rgd->rd_dinodes) {
290 if (gfs2_consist_rgrpd(rgd)) 288 if (gfs2_consist_rgrpd(rgd))
291 fs_err(sdp, "used metadata mismatch: %u != %u\n", 289 fs_err(sdp, "used metadata mismatch: %u != %u\n",
292 count[3], rgd->rd_rg.rg_dinodes); 290 count[3], rgd->rd_dinodes);
293 return; 291 return;
294 } 292 }
295 293
@@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
501 for (rgrps = 0;; rgrps++) { 499 for (rgrps = 0;; rgrps++) {
502 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 500 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
503 501
504 if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) 502 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
505 break; 503 break;
506 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 504 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
507 sizeof(struct gfs2_rindex)); 505 sizeof(struct gfs2_rindex));
@@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
590 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
591 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
592 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
593 u64 rgrp_count = ip->i_di.di_size; 591 u64 rgrp_count = ip->i_disksize;
594 int error; 592 int error;
595 593
596 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { 594 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
@@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
634 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { 632 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
635 /* Ignore partials */ 633 /* Ignore partials */
636 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > 634 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
637 ip->i_di.di_size) 635 ip->i_disksize)
638 break; 636 break;
639 error = read_rindex_entry(ip, &ra_state); 637 error = read_rindex_entry(ip, &ra_state);
640 if (error) { 638 if (error) {
@@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
692static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) 690static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
693{ 691{
694 const struct gfs2_rgrp *str = buf; 692 const struct gfs2_rgrp *str = buf;
695 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
696 u32 rg_flags; 693 u32 rg_flags;
697 694
698 rg_flags = be32_to_cpu(str->rg_flags); 695 rg_flags = be32_to_cpu(str->rg_flags);
@@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
700 rgd->rd_flags |= GFS2_RDF_NOALLOC; 697 rgd->rd_flags |= GFS2_RDF_NOALLOC;
701 else 698 else
702 rgd->rd_flags &= ~GFS2_RDF_NOALLOC; 699 rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
703 rg->rg_free = be32_to_cpu(str->rg_free); 700 rgd->rd_free = be32_to_cpu(str->rg_free);
704 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); 701 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
705 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); 702 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
706} 703}
707 704
708static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) 705static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
709{ 706{
710 struct gfs2_rgrp *str = buf; 707 struct gfs2_rgrp *str = buf;
711 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
712 u32 rg_flags = 0; 708 u32 rg_flags = 0;
713 709
714 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 710 if (rgd->rd_flags & GFS2_RDF_NOALLOC)
715 rg_flags |= GFS2_RGF_NOALLOC; 711 rg_flags |= GFS2_RGF_NOALLOC;
716 str->rg_flags = cpu_to_be32(rg_flags); 712 str->rg_flags = cpu_to_be32(rg_flags);
717 str->rg_free = cpu_to_be32(rg->rg_free); 713 str->rg_free = cpu_to_be32(rgd->rd_free);
718 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); 714 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
719 str->__pad = cpu_to_be32(0); 715 str->__pad = cpu_to_be32(0);
720 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); 716 str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
721 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); 717 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
722} 718}
723 719
@@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
776 } 772 }
777 773
778 spin_lock(&sdp->sd_rindex_spin); 774 spin_lock(&sdp->sd_rindex_spin);
779 rgd->rd_free_clone = rgd->rd_rg.rg_free; 775 rgd->rd_free_clone = rgd->rd_free;
780 rgd->rd_bh_count++; 776 rgd->rd_bh_count++;
781 spin_unlock(&sdp->sd_rindex_spin); 777 spin_unlock(&sdp->sd_rindex_spin);
782 778
@@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
850 } 846 }
851 847
852 spin_lock(&sdp->sd_rindex_spin); 848 spin_lock(&sdp->sd_rindex_spin);
853 rgd->rd_free_clone = rgd->rd_rg.rg_free; 849 rgd->rd_free_clone = rgd->rd_free;
854 spin_unlock(&sdp->sd_rindex_spin); 850 spin_unlock(&sdp->sd_rindex_spin);
855} 851}
856 852
@@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1403 block = rgd->rd_data0 + blk; 1399 block = rgd->rd_data0 + blk;
1404 ip->i_goal = block; 1400 ip->i_goal = block;
1405 1401
1406 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n); 1402 gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
1407 rgd->rd_rg.rg_free -= *n; 1403 rgd->rd_free -= *n;
1408 1404
1409 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1405 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1410 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1406 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1445 1441
1446 block = rgd->rd_data0 + blk; 1442 block = rgd->rd_data0 + blk;
1447 1443
1448 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); 1444 gfs2_assert_withdraw(sdp, rgd->rd_free);
1449 rgd->rd_rg.rg_free--; 1445 rgd->rd_free--;
1450 rgd->rd_rg.rg_dinodes++; 1446 rgd->rd_dinodes++;
1451 *generation = rgd->rd_rg.rg_igeneration++; 1447 *generation = rgd->rd_igeneration++;
1452 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1448 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1453 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1449 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1454 1450
@@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1481 if (!rgd) 1477 if (!rgd)
1482 return; 1478 return;
1483 1479
1484 rgd->rd_rg.rg_free += blen; 1480 rgd->rd_free += blen;
1485 1481
1486 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1482 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1487 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1483 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1509 if (!rgd) 1505 if (!rgd)
1510 return; 1506 return;
1511 1507
1512 rgd->rd_rg.rg_free += blen; 1508 rgd->rd_free += blen;
1513 1509
1514 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1510 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1515 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1511 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1546 return; 1542 return;
1547 gfs2_assert_withdraw(sdp, rgd == tmp_rgd); 1543 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1548 1544
1549 if (!rgd->rd_rg.rg_dinodes) 1545 if (!rgd->rd_dinodes)
1550 gfs2_consist_rgrpd(rgd); 1546 gfs2_consist_rgrpd(rgd);
1551 rgd->rd_rg.rg_dinodes--; 1547 rgd->rd_dinodes--;
1552 rgd->rd_rg.rg_free++; 1548 rgd->rd_free++;
1553 1549
1554 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1550 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1555 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1551 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c3ba3d9d0aac..141b781f2fcc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -34,76 +34,6 @@
34#include "util.h" 34#include "util.h"
35 35
36/** 36/**
37 * gfs2_jindex_hold - Grab a lock on the jindex
38 * @sdp: The GFS2 superblock
39 * @ji_gh: the holder for the jindex glock
40 *
41 * This is very similar to the gfs2_rindex_hold() function, except that
42 * in general we hold the jindex lock for longer periods of time and
43 * we grab it far less frequently (in general) then the rgrp lock.
44 *
45 * Returns: errno
46 */
47
48int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
49{
50 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
51 struct qstr name;
52 char buf[20];
53 struct gfs2_jdesc *jd;
54 int error;
55
56 name.name = buf;
57
58 mutex_lock(&sdp->sd_jindex_mutex);
59
60 for (;;) {
61 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
62 if (error)
63 break;
64
65 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
66 name.hash = gfs2_disk_hash(name.name, name.len);
67
68 error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
69 if (error == -ENOENT) {
70 error = 0;
71 break;
72 }
73
74 gfs2_glock_dq_uninit(ji_gh);
75
76 if (error)
77 break;
78
79 error = -ENOMEM;
80 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
81 if (!jd)
82 break;
83
84 INIT_LIST_HEAD(&jd->extent_list);
85 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
86 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
87 if (!jd->jd_inode)
88 error = -ENOENT;
89 else
90 error = PTR_ERR(jd->jd_inode);
91 kfree(jd);
92 break;
93 }
94
95 spin_lock(&sdp->sd_jindex_spin);
96 jd->jd_jid = sdp->sd_journals++;
97 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
98 spin_unlock(&sdp->sd_jindex_spin);
99 }
100
101 mutex_unlock(&sdp->sd_jindex_mutex);
102
103 return error;
104}
105
106/**
107 * gfs2_jindex_free - Clear all the journal index information 37 * gfs2_jindex_free - Clear all the journal index information
108 * @sdp: The GFS2 superblock 38 * @sdp: The GFS2 superblock
109 * 39 *
@@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
166 return jd; 96 return jd;
167} 97}
168 98
169void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
170{
171 struct gfs2_jdesc *jd;
172
173 spin_lock(&sdp->sd_jindex_spin);
174 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
175 if (jd)
176 jd->jd_dirty = 1;
177 spin_unlock(&sdp->sd_jindex_spin);
178}
179
180struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
181{
182 struct gfs2_jdesc *jd;
183 int found = 0;
184
185 spin_lock(&sdp->sd_jindex_spin);
186
187 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
188 if (jd->jd_dirty) {
189 jd->jd_dirty = 0;
190 found = 1;
191 break;
192 }
193 }
194 spin_unlock(&sdp->sd_jindex_spin);
195
196 if (!found)
197 jd = NULL;
198
199 return jd;
200}
201
202int gfs2_jdesc_check(struct gfs2_jdesc *jd) 99int gfs2_jdesc_check(struct gfs2_jdesc *jd)
203{ 100{
204 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 101 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
206 int ar; 103 int ar;
207 int error; 104 int error;
208 105
209 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || 106 if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
210 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { 107 (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
211 gfs2_consist_inode(ip); 108 gfs2_consist_inode(ip);
212 return -EIO; 109 return -EIO;
213 } 110 }
214 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; 111 jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
215 112
216 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); 113 error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
217 if (!error && ar) { 114 if (!error && ar) {
218 gfs2_consist_inode(ip); 115 gfs2_consist_inode(ip);
219 error = -EIO; 116 error = -EIO;
@@ -423,137 +320,6 @@ out:
423 return error; 320 return error;
424} 321}
425 322
426/**
427 * gfs2_statfs_i - Do a statfs
428 * @sdp: the filesystem
429 * @sg: the sg structure
430 *
431 * Returns: errno
432 */
433
434int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
435{
436 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
437 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
438
439 spin_lock(&sdp->sd_statfs_spin);
440
441 *sc = *m_sc;
442 sc->sc_total += l_sc->sc_total;
443 sc->sc_free += l_sc->sc_free;
444 sc->sc_dinodes += l_sc->sc_dinodes;
445
446 spin_unlock(&sdp->sd_statfs_spin);
447
448 if (sc->sc_free < 0)
449 sc->sc_free = 0;
450 if (sc->sc_free > sc->sc_total)
451 sc->sc_free = sc->sc_total;
452 if (sc->sc_dinodes < 0)
453 sc->sc_dinodes = 0;
454
455 return 0;
456}
457
458/**
459 * statfs_fill - fill in the sg for a given RG
460 * @rgd: the RG
461 * @sc: the sc structure
462 *
463 * Returns: 0 on success, -ESTALE if the LVB is invalid
464 */
465
466static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
467 struct gfs2_statfs_change_host *sc)
468{
469 gfs2_rgrp_verify(rgd);
470 sc->sc_total += rgd->rd_data;
471 sc->sc_free += rgd->rd_rg.rg_free;
472 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
473 return 0;
474}
475
476/**
477 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
478 * @sdp: the filesystem
479 * @sc: the sc info that will be returned
480 *
481 * Any error (other than a signal) will cause this routine to fall back
482 * to the synchronous version.
483 *
484 * FIXME: This really shouldn't busy wait like this.
485 *
486 * Returns: errno
487 */
488
489int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
490{
491 struct gfs2_holder ri_gh;
492 struct gfs2_rgrpd *rgd_next;
493 struct gfs2_holder *gha, *gh;
494 unsigned int slots = 64;
495 unsigned int x;
496 int done;
497 int error = 0, err;
498
499 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
500 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
501 if (!gha)
502 return -ENOMEM;
503
504 error = gfs2_rindex_hold(sdp, &ri_gh);
505 if (error)
506 goto out;
507
508 rgd_next = gfs2_rgrpd_get_first(sdp);
509
510 for (;;) {
511 done = 1;
512
513 for (x = 0; x < slots; x++) {
514 gh = gha + x;
515
516 if (gh->gh_gl && gfs2_glock_poll(gh)) {
517 err = gfs2_glock_wait(gh);
518 if (err) {
519 gfs2_holder_uninit(gh);
520 error = err;
521 } else {
522 if (!error)
523 error = statfs_slow_fill(
524 gh->gh_gl->gl_object, sc);
525 gfs2_glock_dq_uninit(gh);
526 }
527 }
528
529 if (gh->gh_gl)
530 done = 0;
531 else if (rgd_next && !error) {
532 error = gfs2_glock_nq_init(rgd_next->rd_gl,
533 LM_ST_SHARED,
534 GL_ASYNC,
535 gh);
536 rgd_next = gfs2_rgrpd_get_next(rgd_next);
537 done = 0;
538 }
539
540 if (signal_pending(current))
541 error = -ERESTARTSYS;
542 }
543
544 if (done)
545 break;
546
547 yield();
548 }
549
550 gfs2_glock_dq_uninit(&ri_gh);
551
552out:
553 kfree(gha);
554 return error;
555}
556
557struct lfcc { 323struct lfcc {
558 struct list_head list; 324 struct list_head list;
559 struct gfs2_holder gh; 325 struct gfs2_holder gh;
@@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
580 struct gfs2_log_header_host lh; 346 struct gfs2_log_header_host lh;
581 int error; 347 int error;
582 348
583 error = gfs2_jindex_hold(sdp, &ji_gh);
584 if (error)
585 return error;
586
587 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { 349 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
588 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); 350 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
589 if (!lfcc) { 351 if (!lfcc) {
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 50a4c9b1215e..f6b8b00ad881 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -10,6 +10,8 @@
10#ifndef __SUPER_DOT_H__ 10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__ 11#define __SUPER_DOT_H__
12 12
13#include <linux/fs.h>
14#include <linux/dcache.h>
13#include "incore.h" 15#include "incore.h"
14 16
15void gfs2_lm_unmount(struct gfs2_sbd *sdp); 17void gfs2_lm_unmount(struct gfs2_sbd *sdp);
@@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
23 return x; 25 return x;
24} 26}
25 27
26int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
27void gfs2_jindex_free(struct gfs2_sbd *sdp); 28void gfs2_jindex_free(struct gfs2_sbd *sdp);
28 29
29struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); 30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
30void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
31struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
32int gfs2_jdesc_check(struct gfs2_jdesc *jd); 31int gfs2_jdesc_check(struct gfs2_jdesc *jd);
33 32
34int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, 33int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
@@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp);
40void gfs2_statfs_change(struct gfs2_sbd *sdp, 39void gfs2_statfs_change(struct gfs2_sbd *sdp,
41 s64 total, s64 free, s64 dinodes); 40 s64 total, s64 free, s64 dinodes);
42int gfs2_statfs_sync(struct gfs2_sbd *sdp); 41int gfs2_statfs_sync(struct gfs2_sbd *sdp);
43int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
44int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
45 42
46int gfs2_freeze_fs(struct gfs2_sbd *sdp); 43int gfs2_freeze_fs(struct gfs2_sbd *sdp);
47void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); 44void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
48 45
46extern struct file_system_type gfs2_fs_type;
47extern struct file_system_type gfs2meta_fs_type;
48extern const struct export_operations gfs2_export_ops;
49extern const struct super_operations gfs2_super_ops;
50extern struct dentry_operations gfs2_dops;
51
49#endif /* __SUPER_DOT_H__ */ 52#endif /* __SUPER_DOT_H__ */
50 53
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7e1879f1a02c..26c1fa777a95 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,9 +26,6 @@
26#include "quota.h" 26#include "quota.h"
27#include "util.h" 27#include "util.h"
28 28
29char *gfs2_sys_margs;
30spinlock_t gfs2_sys_margs_lock;
31
32static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) 29static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
33{ 30{
34 return snprintf(buf, PAGE_SIZE, "%u:%u\n", 31 return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -263,7 +260,6 @@ ARGS_ATTR(localcaching, "%d\n");
263ARGS_ATTR(localflocks, "%d\n"); 260ARGS_ATTR(localflocks, "%d\n");
264ARGS_ATTR(debug, "%d\n"); 261ARGS_ATTR(debug, "%d\n");
265ARGS_ATTR(upgrade, "%d\n"); 262ARGS_ATTR(upgrade, "%d\n");
266ARGS_ATTR(num_glockd, "%u\n");
267ARGS_ATTR(posix_acl, "%d\n"); 263ARGS_ATTR(posix_acl, "%d\n");
268ARGS_ATTR(quota, "%u\n"); 264ARGS_ATTR(quota, "%u\n");
269ARGS_ATTR(suiddir, "%d\n"); 265ARGS_ATTR(suiddir, "%d\n");
@@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = {
279 &args_attr_localflocks.attr, 275 &args_attr_localflocks.attr,
280 &args_attr_debug.attr, 276 &args_attr_debug.attr,
281 &args_attr_upgrade.attr, 277 &args_attr_upgrade.attr,
282 &args_attr_num_glockd.attr,
283 &args_attr_posix_acl.attr, 278 &args_attr_posix_acl.attr,
284 &args_attr_quota.attr, 279 &args_attr_quota.attr,
285 &args_attr_suiddir.attr, 280 &args_attr_suiddir.attr,
@@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = {
288}; 283};
289 284
290/* 285/*
291 * display counters from superblock
292 */
293
294struct counters_attr {
295 struct attribute attr;
296 ssize_t (*show)(struct gfs2_sbd *, char *);
297};
298
299#define COUNTERS_ATTR(name, fmt) \
300static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
301{ \
302 return snprintf(buf, PAGE_SIZE, fmt, \
303 (unsigned int)atomic_read(&sdp->sd_##name)); \
304} \
305static struct counters_attr counters_attr_##name = __ATTR_RO(name)
306
307COUNTERS_ATTR(reclaimed, "%u\n");
308
309static struct attribute *counters_attrs[] = {
310 &counters_attr_reclaimed.attr,
311 NULL,
312};
313
314/*
315 * get and set struct gfs2_tune fields 286 * get and set struct gfs2_tune fields
316 */ 287 */
317 288
@@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
393} \ 364} \
394TUNE_ATTR_2(name, name##_store) 365TUNE_ATTR_2(name, name##_store)
395 366
396TUNE_ATTR(demote_secs, 0);
397TUNE_ATTR(incore_log_blocks, 0); 367TUNE_ATTR(incore_log_blocks, 0);
398TUNE_ATTR(log_flush_secs, 0); 368TUNE_ATTR(log_flush_secs, 0);
399TUNE_ATTR(quota_warn_period, 0); 369TUNE_ATTR(quota_warn_period, 0);
@@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1);
408TUNE_ATTR(statfs_quantum, 1); 378TUNE_ATTR(statfs_quantum, 1);
409TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); 379TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
410TUNE_ATTR_DAEMON(logd_secs, logd_process); 380TUNE_ATTR_DAEMON(logd_secs, logd_process);
411TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
412TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 381TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
413 382
414static struct attribute *tune_attrs[] = { 383static struct attribute *tune_attrs[] = {
415 &tune_attr_demote_secs.attr,
416 &tune_attr_incore_log_blocks.attr, 384 &tune_attr_incore_log_blocks.attr,
417 &tune_attr_log_flush_secs.attr, 385 &tune_attr_log_flush_secs.attr,
418 &tune_attr_quota_warn_period.attr, 386 &tune_attr_quota_warn_period.attr,
@@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = {
426 &tune_attr_statfs_quantum.attr, 394 &tune_attr_statfs_quantum.attr,
427 &tune_attr_recoverd_secs.attr, 395 &tune_attr_recoverd_secs.attr,
428 &tune_attr_logd_secs.attr, 396 &tune_attr_logd_secs.attr,
429 &tune_attr_quotad_secs.attr,
430 &tune_attr_quota_scale.attr, 397 &tune_attr_quota_scale.attr,
431 &tune_attr_new_files_jdata.attr, 398 &tune_attr_new_files_jdata.attr,
432 NULL, 399 NULL,
@@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = {
437 .attrs = lockstruct_attrs, 404 .attrs = lockstruct_attrs,
438}; 405};
439 406
440static struct attribute_group counters_group = {
441 .name = "counters",
442 .attrs = counters_attrs,
443};
444
445static struct attribute_group args_group = { 407static struct attribute_group args_group = {
446 .name = "args", 408 .name = "args",
447 .attrs = args_attrs, 409 .attrs = args_attrs,
@@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
466 if (error) 428 if (error)
467 goto fail_reg; 429 goto fail_reg;
468 430
469 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
470 if (error)
471 goto fail_lockstruct;
472
473 error = sysfs_create_group(&sdp->sd_kobj, &args_group); 431 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
474 if (error) 432 if (error)
475 goto fail_counters; 433 goto fail_lockstruct;
476 434
477 error = sysfs_create_group(&sdp->sd_kobj, &tune_group); 435 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
478 if (error) 436 if (error)
@@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
483 441
484fail_args: 442fail_args:
485 sysfs_remove_group(&sdp->sd_kobj, &args_group); 443 sysfs_remove_group(&sdp->sd_kobj, &args_group);
486fail_counters:
487 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
488fail_lockstruct: 444fail_lockstruct:
489 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 445 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
490fail_reg: 446fail_reg:
@@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
498{ 454{
499 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 455 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
500 sysfs_remove_group(&sdp->sd_kobj, &args_group); 456 sysfs_remove_group(&sdp->sd_kobj, &args_group);
501 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
502 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 457 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
503 kobject_put(&sdp->sd_kobj); 458 kobject_put(&sdp->sd_kobj);
504} 459}
505 460
461static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
462 struct kobj_uevent_env *env)
463{
464 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
465 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
466 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
467 return 0;
468}
469
470static struct kset_uevent_ops gfs2_uevent_ops = {
471 .uevent = gfs2_uevent,
472};
473
474
506int gfs2_sys_init(void) 475int gfs2_sys_init(void)
507{ 476{
508 gfs2_sys_margs = NULL; 477 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
509 spin_lock_init(&gfs2_sys_margs_lock);
510 gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
511 if (!gfs2_kset) 478 if (!gfs2_kset)
512 return -ENOMEM; 479 return -ENOMEM;
513 return 0; 480 return 0;
@@ -515,7 +482,6 @@ int gfs2_sys_init(void)
515 482
516void gfs2_sys_uninit(void) 483void gfs2_sys_uninit(void)
517{ 484{
518 kfree(gfs2_sys_margs);
519 kset_unregister(gfs2_kset); 485 kset_unregister(gfs2_kset);
520} 486}
521 487
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index 1ca8cdac5304..e94560e836d7 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -13,10 +13,6 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14struct gfs2_sbd; 14struct gfs2_sbd;
15 15
16/* Allow args to be passed to GFS2 when using an initial ram disk */
17extern char *gfs2_sys_margs;
18extern spinlock_t gfs2_sys_margs_lock;
19
20int gfs2_sys_fs_add(struct gfs2_sbd *sdp); 16int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
21void gfs2_sys_fs_del(struct gfs2_sbd *sdp); 17void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
22 18
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index d31e355c61fb..374f50e95496 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly;
25struct kmem_cache *gfs2_inode_cachep __read_mostly; 25struct kmem_cache *gfs2_inode_cachep __read_mostly;
26struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 26struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; 27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
28struct kmem_cache *gfs2_quotad_cachep __read_mostly;
28 29
29void gfs2_assert_i(struct gfs2_sbd *sdp) 30void gfs2_assert_i(struct gfs2_sbd *sdp)
30{ 31{
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 7f48576289c9..33e96b0ce9ab 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 148extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 149extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep; 150extern struct kmem_cache *gfs2_rgrpd_cachep;
151extern struct kmem_cache *gfs2_quotad_cachep;
151 152
152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, 153static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
153 unsigned int *p) 154 unsigned int *p)
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
new file mode 100644
index 000000000000..b77c5bc20f8a
--- /dev/null
+++ b/fs/hfs/Kconfig
@@ -0,0 +1,12 @@
1config HFS_FS
2 tristate "Apple Macintosh file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 select NLS
5 help
6 If you say Y here, you will be able to mount Macintosh-formatted
7 floppy disks and hard drive partitions with full read-write access.
8 Please read <file:Documentation/filesystems/hfs.txt> to learn about
9 the available mount options.
10
11 To compile this file system support as a module, choose M here: the
12 module will be called hfs.
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
new file mode 100644
index 000000000000..a63371815aab
--- /dev/null
+++ b/fs/hfsplus/Kconfig
@@ -0,0 +1,13 @@
1config HFSPLUS_FS
2 tristate "Apple Extended HFS file system support"
3 depends on BLOCK
4 select NLS
5 select NLS_UTF8
6 help
7 If you say Y here, you will be able to mount extended format
8 Macintosh-formatted hard drive partitions with full read-write access.
9
10 This file system is often called HFS+ and was introduced with
11 MacOS 8. It includes all Mac specific filesystem data such as
12 data forks and creator codes, but it also has several UNIX
13 style features such as file ownership and permissions.
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a31451ac170..5c538e0ec14b 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
501{ 501{
502 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 502 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
503 503
504 *pagep = __grab_cache_page(mapping, index); 504 *pagep = grab_cache_page_write_begin(mapping, index, flags);
505 if (!*pagep) 505 if (!*pagep)
506 return -ENOMEM; 506 return -ENOMEM;
507 return 0; 507 return 0;
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
new file mode 100644
index 000000000000..56bd15c5bf6c
--- /dev/null
+++ b/fs/hpfs/Kconfig
@@ -0,0 +1,14 @@
1config HPFS_FS
2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK
4 help
5 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
6 is the file system used for organizing files on OS/2 hard disk
7 partitions. Say Y if you want to be able to read files from and
8 write files to an OS/2 HPFS partition on your hard drive. OS/2
9 floppies however are in regular MSDOS format, so you don't need this
10 option in order to be able to read them. Read
11 <file:Documentation/filesystems/hpfs.txt>.
12
13 To compile this file system support as a module, choose M here: the
14 module will be called hpfs. If unsure, say N.
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d479ce3aceb..6903d37af037 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
252 for (;;) { 252 for (;;) {
253 struct page *page; 253 struct page *page;
254 unsigned long nr, ret; 254 unsigned long nr, ret;
255 int ra;
255 256
256 /* nr is the maximum number of bytes to copy from this page */ 257 /* nr is the maximum number of bytes to copy from this page */
257 nr = huge_page_size(h); 258 nr = huge_page_size(h);
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
274 */ 275 */
275 ret = len < nr ? len : nr; 276 ret = len < nr ? len : nr;
276 if (clear_user(buf, ret)) 277 if (clear_user(buf, ret))
277 ret = -EFAULT; 278 ra = -EFAULT;
279 else
280 ra = 0;
278 } else { 281 } else {
279 /* 282 /*
280 * We have the page, copy it to user space buffer. 283 * We have the page, copy it to user space buffer.
281 */ 284 */
282 ret = hugetlbfs_read_actor(page, offset, buf, len, nr); 285 ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
286 ret = ra;
283 } 287 }
284 if (ret < 0) { 288 if (ra < 0) {
285 if (retval == 0) 289 if (retval == 0)
286 retval = ret; 290 retval = ra;
287 if (page) 291 if (page)
288 page_cache_release(page); 292 page_cache_release(page);
289 goto out; 293 goto out;
@@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
506 inode->i_mode = mode; 510 inode->i_mode = mode;
507 inode->i_uid = uid; 511 inode->i_uid = uid;
508 inode->i_gid = gid; 512 inode->i_gid = gid;
509 inode->i_blocks = 0;
510 inode->i_mapping->a_ops = &hugetlbfs_aops; 513 inode->i_mapping->a_ops = &hugetlbfs_aops;
511 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 514 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
512 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 515 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index ed22b14f2202..40e37c026565 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -23,6 +23,7 @@
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/inotify.h> 24#include <linux/inotify.h>
25#include <linux/mount.h> 25#include <linux/mount.h>
26#include <linux/async.h>
26 27
27/* 28/*
28 * This is needed for the following functions: 29 * This is needed for the following functions:
@@ -111,8 +112,8 @@ static void wake_up_inode(struct inode *inode)
111 112
112/** 113/**
113 * inode_init_always - perform inode structure intialisation 114 * inode_init_always - perform inode structure intialisation
114 * @sb - superblock inode belongs to. 115 * @sb: superblock inode belongs to
115 * @inode - inode to initialise 116 * @inode: inode to initialise
116 * 117 *
117 * These are initializations that need to be done on every inode 118 * These are initializations that need to be done on every inode
118 * allocation as the fields are not initialised by slab allocation. 119 * allocation as the fields are not initialised by slab allocation.
@@ -132,6 +133,8 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
132 inode->i_op = &empty_iops; 133 inode->i_op = &empty_iops;
133 inode->i_fop = &empty_fops; 134 inode->i_fop = &empty_fops;
134 inode->i_nlink = 1; 135 inode->i_nlink = 1;
136 inode->i_uid = 0;
137 inode->i_gid = 0;
135 atomic_set(&inode->i_writecount, 0); 138 atomic_set(&inode->i_writecount, 0);
136 inode->i_size = 0; 139 inode->i_size = 0;
137 inode->i_blocks = 0; 140 inode->i_blocks = 0;
@@ -165,7 +168,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
165 mapping->a_ops = &empty_aops; 168 mapping->a_ops = &empty_aops;
166 mapping->host = inode; 169 mapping->host = inode;
167 mapping->flags = 0; 170 mapping->flags = 0;
168 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); 171 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
169 mapping->assoc_mapping = NULL; 172 mapping->assoc_mapping = NULL;
170 mapping->backing_dev_info = &default_backing_dev_info; 173 mapping->backing_dev_info = &default_backing_dev_info;
171 mapping->writeback_index = 0; 174 mapping->writeback_index = 0;
@@ -584,8 +587,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
584 587
585/** 588/**
586 * inode_add_to_lists - add a new inode to relevant lists 589 * inode_add_to_lists - add a new inode to relevant lists
587 * @sb - superblock inode belongs to. 590 * @sb: superblock inode belongs to
588 * @inode - inode to mark in use 591 * @inode: inode to mark in use
589 * 592 *
590 * When an inode is allocated it needs to be accounted for, added to the in use 593 * When an inode is allocated it needs to be accounted for, added to the in use
591 * list, the owning superblock and the inode hash. This needs to be done under 594 * list, the owning superblock and the inode hash. This needs to be done under
@@ -609,7 +612,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
609 * @sb: superblock 612 * @sb: superblock
610 * 613 *
611 * Allocates a new inode for given superblock. The default gfp_mask 614 * Allocates a new inode for given superblock. The default gfp_mask
612 * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. 615 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
613 * If HIGHMEM pages are unsuitable or it is known that pages allocated 616 * If HIGHMEM pages are unsuitable or it is known that pages allocated
614 * for the page cache are not reclaimable or migratable, 617 * for the page cache are not reclaimable or migratable,
615 * mapping_set_gfp_mask() must be called with suitable flags on the 618 * mapping_set_gfp_mask() must be called with suitable flags on the
@@ -1042,6 +1045,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
1042 1045
1043EXPORT_SYMBOL(iget_locked); 1046EXPORT_SYMBOL(iget_locked);
1044 1047
1048int insert_inode_locked(struct inode *inode)
1049{
1050 struct super_block *sb = inode->i_sb;
1051 ino_t ino = inode->i_ino;
1052 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1053 struct inode *old;
1054
1055 inode->i_state |= I_LOCK|I_NEW;
1056 while (1) {
1057 spin_lock(&inode_lock);
1058 old = find_inode_fast(sb, head, ino);
1059 if (likely(!old)) {
1060 hlist_add_head(&inode->i_hash, head);
1061 spin_unlock(&inode_lock);
1062 return 0;
1063 }
1064 __iget(old);
1065 spin_unlock(&inode_lock);
1066 wait_on_inode(old);
1067 if (unlikely(!hlist_unhashed(&old->i_hash))) {
1068 iput(old);
1069 return -EBUSY;
1070 }
1071 iput(old);
1072 }
1073}
1074
1075EXPORT_SYMBOL(insert_inode_locked);
1076
1077int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1078 int (*test)(struct inode *, void *), void *data)
1079{
1080 struct super_block *sb = inode->i_sb;
1081 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1082 struct inode *old;
1083
1084 inode->i_state |= I_LOCK|I_NEW;
1085
1086 while (1) {
1087 spin_lock(&inode_lock);
1088 old = find_inode(sb, head, test, data);
1089 if (likely(!old)) {
1090 hlist_add_head(&inode->i_hash, head);
1091 spin_unlock(&inode_lock);
1092 return 0;
1093 }
1094 __iget(old);
1095 spin_unlock(&inode_lock);
1096 wait_on_inode(old);
1097 if (unlikely(!hlist_unhashed(&old->i_hash))) {
1098 iput(old);
1099 return -EBUSY;
1100 }
1101 iput(old);
1102 }
1103}
1104
1105EXPORT_SYMBOL(insert_inode_locked4);
1106
1045/** 1107/**
1046 * __insert_inode_hash - hash an inode 1108 * __insert_inode_hash - hash an inode
1047 * @inode: unhashed inode 1109 * @inode: unhashed inode
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 43e8b2c0664b..240ec63984cb 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
233 233
234/* 234/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
235 * @inode - the inode to map 236 * @inode - the inode to map
236 * @arg - the pointer to userspace where we copy everything to 237 * @arg - the pointer to userspace where we copy everything to
237 * @get_block - the fs's get_block function 238 * @get_block - the fs's get_block function
@@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
242 * 243 *
243 * If it is possible to have data blocks beyond a hole past @inode->i_size, then 244 * If it is possible to have data blocks beyond a hole past @inode->i_size, then
244 * please do not use this function, it will stop at the first unmapped block 245 * please do not use this function, it will stop at the first unmapped block
245 * beyond i_size 246 * beyond i_size.
247 *
248 * If you use this function directly, you need to do your own locking. Use
249 * generic_block_fiemap if you want the locking done for you.
246 */ 250 */
247int generic_block_fiemap(struct inode *inode, 251
248 struct fiemap_extent_info *fieinfo, u64 start, 252int __generic_block_fiemap(struct inode *inode,
249 u64 len, get_block_t *get_block) 253 struct fiemap_extent_info *fieinfo, u64 start,
254 u64 len, get_block_t *get_block)
250{ 255{
251 struct buffer_head tmp; 256 struct buffer_head tmp;
252 unsigned int start_blk; 257 unsigned int start_blk;
@@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode,
260 265
261 start_blk = logical_to_blk(inode, start); 266 start_blk = logical_to_blk(inode, start);
262 267
263 /* guard against change */
264 mutex_lock(&inode->i_mutex);
265
266 length = (long long)min_t(u64, len, i_size_read(inode)); 268 length = (long long)min_t(u64, len, i_size_read(inode));
267 map_len = length; 269 map_len = length;
268 270
@@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode,
334 cond_resched(); 336 cond_resched();
335 } while (1); 337 } while (1);
336 338
337 mutex_unlock(&inode->i_mutex);
338
339 /* if ret is 1 then we just hit the end of the extent array */ 339 /* if ret is 1 then we just hit the end of the extent array */
340 if (ret == 1) 340 if (ret == 1)
341 ret = 0; 341 ret = 0;
342 342
343 return ret; 343 return ret;
344} 344}
345EXPORT_SYMBOL(__generic_block_fiemap);
346
347/**
348 * generic_block_fiemap - FIEMAP for block based inodes
349 * @inode: The inode to map
350 * @fieinfo: The mapping information
351 * @start: The initial block to map
352 * @len: The length of the extent to attempt to map
353 * @get_block: The block mapping function for the fs
354 *
355 * Calls __generic_block_fiemap to map the inode, after taking
356 * the inode's mutex lock.
357 */
358
359int generic_block_fiemap(struct inode *inode,
360 struct fiemap_extent_info *fieinfo, u64 start,
361 u64 len, get_block_t *get_block)
362{
363 int ret;
364 mutex_lock(&inode->i_mutex);
365 ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
366 mutex_unlock(&inode->i_mutex);
367 return ret;
368}
345EXPORT_SYMBOL(generic_block_fiemap); 369EXPORT_SYMBOL(generic_block_fiemap);
346 370
347#endif /* CONFIG_BLOCK */ 371#endif /* CONFIG_BLOCK */
@@ -415,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
415 return error; 439 return error;
416} 440}
417 441
442static int ioctl_fsfreeze(struct file *filp)
443{
444 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
445
446 if (!capable(CAP_SYS_ADMIN))
447 return -EPERM;
448
449 /* If filesystem doesn't support freeze feature, return. */
450 if (sb->s_op->freeze_fs == NULL)
451 return -EOPNOTSUPP;
452
453 /* If a blockdevice-backed filesystem isn't specified, return. */
454 if (sb->s_bdev == NULL)
455 return -EINVAL;
456
457 /* Freeze */
458 sb = freeze_bdev(sb->s_bdev);
459 if (IS_ERR(sb))
460 return PTR_ERR(sb);
461 return 0;
462}
463
464static int ioctl_fsthaw(struct file *filp)
465{
466 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
467
468 if (!capable(CAP_SYS_ADMIN))
469 return -EPERM;
470
471 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
472 if (sb->s_bdev == NULL)
473 return -EINVAL;
474
475 /* Thaw */
476 return thaw_bdev(sb->s_bdev, sb);
477}
478
418/* 479/*
419 * When you add any new common ioctls to the switches above and below 480 * When you add any new common ioctls to the switches above and below
420 * please update compat_sys_ioctl() too. 481 * please update compat_sys_ioctl() too.
@@ -462,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
462 } else 523 } else
463 error = -ENOTTY; 524 error = -ENOTTY;
464 break; 525 break;
526
527 case FIFREEZE:
528 error = ioctl_fsfreeze(filp);
529 break;
530
531 case FITHAW:
532 error = ioctl_fsthaw(filp);
533 break;
534
465 default: 535 default:
466 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 536 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
467 error = file_ioctl(filp, cmd, arg); 537 error = file_ioctl(filp, cmd, arg);
@@ -472,7 +542,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
472 return error; 542 return error;
473} 543}
474 544
475asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) 545SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
476{ 546{
477 struct file *filp; 547 struct file *filp;
478 int error = -EBADF; 548 int error = -EBADF;
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 3569e0ad86a2..c7c0b28d7d21 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -27,7 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29 29
30static int set_task_ioprio(struct task_struct *task, int ioprio) 30int set_task_ioprio(struct task_struct *task, int ioprio)
31{ 31{
32 int err; 32 int err;
33 struct io_context *ioc; 33 struct io_context *ioc;
@@ -70,8 +70,9 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
70 task_unlock(task); 70 task_unlock(task);
71 return err; 71 return err;
72} 72}
73EXPORT_SYMBOL_GPL(set_task_ioprio);
73 74
74asmlinkage long sys_ioprio_set(int which, int who, int ioprio) 75SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
75{ 76{
76 int class = IOPRIO_PRIO_CLASS(ioprio); 77 int class = IOPRIO_PRIO_CLASS(ioprio);
77 int data = IOPRIO_PRIO_DATA(ioprio); 78 int data = IOPRIO_PRIO_DATA(ioprio);
@@ -187,7 +188,7 @@ int ioprio_best(unsigned short aprio, unsigned short bprio)
187 return aprio; 188 return aprio;
188} 189}
189 190
190asmlinkage long sys_ioprio_get(int which, int who) 191SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
191{ 192{
192 struct task_struct *g, *p; 193 struct task_struct *g, *p;
193 struct user_struct *user; 194 struct user_struct *user;
@@ -251,4 +252,3 @@ asmlinkage long sys_ioprio_get(int which, int who)
251 read_unlock(&tasklist_lock); 252 read_unlock(&tasklist_lock);
252 return ret; 253 return ret;
253} 254}
254
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
new file mode 100644
index 000000000000..8ab9878e3671
--- /dev/null
+++ b/fs/isofs/Kconfig
@@ -0,0 +1,39 @@
1config ISO9660_FS
2 tristate "ISO 9660 CDROM file system support"
3 help
4 This is the standard file system used on CD-ROMs. It was previously
5 known as "High Sierra File System" and is called "hsfs" on other
6 Unix systems. The so-called Rock-Ridge extensions which allow for
7 long Unix filenames and symbolic links are also supported by this
8 driver. If you have a CD-ROM drive and want to do more with it than
9 just listen to audio CDs and watch its LEDs, say Y (and read
10 <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
11 available from <http://www.tldp.org/docs.html#howto>), thereby
12 enlarging your kernel by about 27 KB; otherwise say N.
13
14 To compile this file system support as a module, choose M here: the
15 module will be called isofs.
16
17config JOLIET
18 bool "Microsoft Joliet CDROM extensions"
19 depends on ISO9660_FS
20 select NLS
21 help
22 Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
23 which allows for long filenames in unicode format (unicode is the
24 new 16 bit character code, successor to ASCII, which encodes the
25 characters of almost all languages of the world; see
26 <http://www.unicode.org/> for more information). Say Y here if you
27 want to be able to read Joliet CD-ROMs under Linux.
28
29config ZISOFS
30 bool "Transparent decompression extension"
31 depends on ISO9660_FS
32 select ZLIB_INFLATE
33 help
34 This is a Linux-specific extension to RockRidge which lets you store
35 data in compressed form on a CD-ROM and have it transparently
36 decompressed when the CD-ROM is accessed. See
37 <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
38 necessary to create such a filesystem. Say Y here if you want to be
39 able to read such compressed CD-ROMs.
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f8af0f1505b..6147ec3643a0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -855,10 +855,6 @@ root_found:
855 } 855 }
856 sbi->s_joliet_level = joliet_level; 856 sbi->s_joliet_level = joliet_level;
857 857
858 /* check the root inode */
859 if (!inode->i_op)
860 goto out_bad_root;
861
862 /* Make sure the root inode is a directory */ 858 /* Make sure the root inode is a directory */
863 if (!S_ISDIR(inode->i_mode)) { 859 if (!S_ISDIR(inode->i_mode)) {
864 printk(KERN_WARNING 860 printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
886 /* 882 /*
887 * Display error messages and free resources. 883 * Display error messages and free resources.
888 */ 884 */
889out_bad_root:
890 printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
891out_iput: 885out_iput:
892 iput(inode); 886 iput(inode);
893 goto out_no_inode; 887 goto out_no_inode;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
306 int flags; 306 int flags;
307 int err; 307 int err;
308 unsigned long blocknr; 308 unsigned long blocknr;
309 ktime_t start_time;
310 u64 commit_time;
309 char *tagp = NULL; 311 char *tagp = NULL;
310 journal_header_t *header; 312 journal_header_t *header;
311 journal_block_tag_t *tag = NULL; 313 journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
418 commit_transaction->t_state = T_FLUSH; 420 commit_transaction->t_state = T_FLUSH;
419 journal->j_committing_transaction = commit_transaction; 421 journal->j_committing_transaction = commit_transaction;
420 journal->j_running_transaction = NULL; 422 journal->j_running_transaction = NULL;
423 start_time = ktime_get();
421 commit_transaction->t_log_start = journal->j_head; 424 commit_transaction->t_log_start = journal->j_head;
422 wake_up(&journal->j_wait_transaction_locked); 425 wake_up(&journal->j_wait_transaction_locked);
423 spin_unlock(&journal->j_state_lock); 426 spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
913 J_ASSERT(commit_transaction == journal->j_committing_transaction); 916 J_ASSERT(commit_transaction == journal->j_committing_transaction);
914 journal->j_commit_sequence = commit_transaction->t_tid; 917 journal->j_commit_sequence = commit_transaction->t_tid;
915 journal->j_committing_transaction = NULL; 918 journal->j_committing_transaction = NULL;
919 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
920
921 /*
922 * weight the commit time higher than the average time so we don't
923 * react too strongly to vast changes in commit time
924 */
925 if (likely(journal->j_average_commit_time))
926 journal->j_average_commit_time = (commit_time*3 +
927 journal->j_average_commit_time) / 4;
928 else
929 journal->j_average_commit_time = commit_time;
930
916 spin_unlock(&journal->j_state_lock); 931 spin_unlock(&journal->j_state_lock);
917 932
918 if (commit_transaction->t_checkpoint_list == NULL && 933 if (commit_transaction->t_checkpoint_list == NULL &&
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c8808..e6a117431277 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __journal_temp_unlink_buffer(struct journal_head *jh); 30static void __journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
49{ 50{
50 transaction->t_journal = journal; 51 transaction->t_journal = journal;
51 transaction->t_state = T_RUNNING; 52 transaction->t_state = T_RUNNING;
53 transaction->t_start_time = ktime_get();
52 transaction->t_tid = journal->j_transaction_sequence++; 54 transaction->t_tid = journal->j_transaction_sequence++;
53 transaction->t_expires = jiffies + journal->j_commit_interval; 55 transaction->t_expires = jiffies + journal->j_commit_interval;
54 spin_lock_init(&transaction->t_handle_lock); 56 spin_lock_init(&transaction->t_handle_lock);
@@ -752,7 +754,6 @@ out:
752 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. 754 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
753 * @handle: transaction to add buffer modifications to 755 * @handle: transaction to add buffer modifications to
754 * @bh: bh to be used for metadata writes 756 * @bh: bh to be used for metadata writes
755 * @credits: variable that will receive credits for the buffer
756 * 757 *
757 * Returns an error code or 0 on success. 758 * Returns an error code or 0 on success.
758 * 759 *
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle)
1370{ 1371{
1371 transaction_t *transaction = handle->h_transaction; 1372 transaction_t *transaction = handle->h_transaction;
1372 journal_t *journal = transaction->t_journal; 1373 journal_t *journal = transaction->t_journal;
1373 int old_handle_count, err; 1374 int err;
1374 pid_t pid; 1375 pid_t pid;
1375 1376
1376 J_ASSERT(journal_current_handle() == handle); 1377 J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle)
1399 * on IO anyway. Speeds up many-threaded, many-dir operations 1400 * on IO anyway. Speeds up many-threaded, many-dir operations
1400 * by 30x or more... 1401 * by 30x or more...
1401 * 1402 *
1403 * We try and optimize the sleep time against what the underlying disk
1404 * can do, instead of having a static sleep time. This is useful for
1405 * the case where our storage is so fast that it is more optimal to go
1406 * ahead and force a flush and wait for the transaction to be committed
1407 * than it is to wait for an arbitrary amount of time for new writers to
1408 * join the transaction. We achieve this by measuring how long it takes
1409 * to commit a transaction, and compare it with how long this
1410 * transaction has been running, and if run time < commit time then we
1411 * sleep for the delta and commit. This greatly helps super fast disks
1412 * that would see slowdowns as more threads started doing fsyncs.
1413 *
1402 * But don't do this if this process was the most recent one to 1414 * But don't do this if this process was the most recent one to
1403 * perform a synchronous write. We do this to detect the case where a 1415 * perform a synchronous write. We do this to detect the case where a
1404 * single process is doing a stream of sync writes. No point in waiting 1416 * single process is doing a stream of sync writes. No point in waiting
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle)
1406 */ 1418 */
1407 pid = current->pid; 1419 pid = current->pid;
1408 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1420 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1421 u64 commit_time, trans_time;
1422
1409 journal->j_last_sync_writer = pid; 1423 journal->j_last_sync_writer = pid;
1410 do { 1424
1411 old_handle_count = transaction->t_handle_count; 1425 spin_lock(&journal->j_state_lock);
1412 schedule_timeout_uninterruptible(1); 1426 commit_time = journal->j_average_commit_time;
1413 } while (old_handle_count != transaction->t_handle_count); 1427 spin_unlock(&journal->j_state_lock);
1428
1429 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1430 transaction->t_start_time));
1431
1432 commit_time = min_t(u64, commit_time,
1433 1000*jiffies_to_usecs(1));
1434
1435 if (trans_time < commit_time) {
1436 ktime_t expires = ktime_add_ns(ktime_get(),
1437 commit_time);
1438 set_current_state(TASK_UNINTERRUPTIBLE);
1439 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1440 }
1414 } 1441 }
1415 1442
1416 current->journal_info = NULL; 1443 current->journal_info = NULL;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe920..17159cacbd9e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
249 return ret; 249 return ret;
250} 250}
251 251
252#define NR_BATCH 64
253
254static void 252static void
255__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 253__flush_batch(journal_t *journal, int *batch_count)
256{ 254{
257 int i; 255 int i;
258 256
259 ll_rw_block(SWRITE, *batch_count, bhs); 257 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
260 for (i = 0; i < *batch_count; i++) { 258 for (i = 0; i < *batch_count; i++) {
261 struct buffer_head *bh = bhs[i]; 259 struct buffer_head *bh = journal->j_chkpt_bhs[i];
262 clear_buffer_jwrite(bh); 260 clear_buffer_jwrite(bh);
263 BUFFER_TRACE(bh, "brelse"); 261 BUFFER_TRACE(bh, "brelse");
264 __brelse(bh); 262 __brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
277 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 275 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
278 */ 276 */
279static int __process_buffer(journal_t *journal, struct journal_head *jh, 277static int __process_buffer(journal_t *journal, struct journal_head *jh,
280 struct buffer_head **bhs, int *batch_count, 278 int *batch_count, transaction_t *transaction)
281 transaction_t *transaction)
282{ 279{
283 struct buffer_head *bh = jh2bh(jh); 280 struct buffer_head *bh = jh2bh(jh);
284 int ret = 0; 281 int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
325 get_bh(bh); 322 get_bh(bh);
326 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 323 J_ASSERT_BH(bh, !buffer_jwrite(bh));
327 set_buffer_jwrite(bh); 324 set_buffer_jwrite(bh);
328 bhs[*batch_count] = bh; 325 journal->j_chkpt_bhs[*batch_count] = bh;
329 __buffer_relink_io(jh); 326 __buffer_relink_io(jh);
330 jbd_unlock_bh_state(bh); 327 jbd_unlock_bh_state(bh);
331 transaction->t_chp_stats.cs_written++; 328 transaction->t_chp_stats.cs_written++;
332 (*batch_count)++; 329 (*batch_count)++;
333 if (*batch_count == NR_BATCH) { 330 if (*batch_count == JBD2_NR_BATCH) {
334 spin_unlock(&journal->j_list_lock); 331 spin_unlock(&journal->j_list_lock);
335 __flush_batch(journal, bhs, batch_count); 332 __flush_batch(journal, batch_count);
336 ret = 1; 333 ret = 1;
337 } 334 }
338 } 335 }
@@ -388,7 +385,6 @@ restart:
388 if (journal->j_checkpoint_transactions == transaction && 385 if (journal->j_checkpoint_transactions == transaction &&
389 transaction->t_tid == this_tid) { 386 transaction->t_tid == this_tid) {
390 int batch_count = 0; 387 int batch_count = 0;
391 struct buffer_head *bhs[NR_BATCH];
392 struct journal_head *jh; 388 struct journal_head *jh;
393 int retry = 0, err; 389 int retry = 0, err;
394 390
@@ -402,7 +398,7 @@ restart:
402 retry = 1; 398 retry = 1;
403 break; 399 break;
404 } 400 }
405 retry = __process_buffer(journal, jh, bhs, &batch_count, 401 retry = __process_buffer(journal, jh, &batch_count,
406 transaction); 402 transaction);
407 if (retry < 0 && !result) 403 if (retry < 0 && !result)
408 result = retry; 404 result = retry;
@@ -419,7 +415,7 @@ restart:
419 spin_unlock(&journal->j_list_lock); 415 spin_unlock(&journal->j_list_lock);
420 retry = 1; 416 retry = 1;
421 } 417 }
422 __flush_batch(journal, bhs, &batch_count); 418 __flush_batch(journal, &batch_count);
423 } 419 }
424 420
425 if (retry) { 421 if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
686 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
687 683
688 __jbd2_journal_drop_transaction(journal, transaction); 684 __jbd2_journal_drop_transaction(journal, transaction);
685 kfree(transaction);
689 686
690 /* Just in case anybody was waiting for more transactions to be 687 /* Just in case anybody was waiting for more transactions to be
691 checkpointed... */ 688 checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
760 J_ASSERT(journal->j_running_transaction != transaction); 757 J_ASSERT(journal->j_running_transaction != transaction);
761 758
762 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 759 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
763 kfree(transaction);
764} 760}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a8..62804e57a44c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/crc32.h> 25#include <linux/crc32.h>
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bio.h>
28 29
29/* 30/*
30 * Default IO end handler for temporary BJ_IO buffer_heads. 31 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
137 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
138 barrier_done = 1; 139 barrier_done = 1;
139 } 140 }
140 ret = submit_bh(WRITE, bh); 141 ret = submit_bh(WRITE_SYNC, bh);
141 if (barrier_done) 142 if (barrier_done)
142 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
143 144
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
158 lock_buffer(bh); 159 lock_buffer(bh);
159 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
160 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
161 ret = submit_bh(WRITE, bh); 162 ret = submit_bh(WRITE_SYNC, bh);
162 } 163 }
163 *cbh = bh; 164 *cbh = bh;
164 return ret; 165 return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
168 * This function along with journal_submit_commit_record 169 * This function along with journal_submit_commit_record
169 * allows to write the commit record asynchronously. 170 * allows to write the commit record asynchronously.
170 */ 171 */
171static int journal_wait_on_commit_record(struct buffer_head *bh) 172static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
172{ 174{
173 int ret = 0; 175 int ret = 0;
174 176
177retry:
175 clear_buffer_dirty(bh); 178 clear_buffer_dirty(bh);
176 wait_on_buffer(bh); 179 wait_on_buffer(bh);
180 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
181 printk(KERN_WARNING
182 "JBD2: wait_on_commit_record: sync failed on %s - "
183 "disabling barriers\n", journal->j_devname);
184 spin_lock(&journal->j_state_lock);
185 journal->j_flags &= ~JBD2_BARRIER;
186 spin_unlock(&journal->j_state_lock);
187
188 lock_buffer(bh);
189 clear_buffer_dirty(bh);
190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync;
192
193 ret = submit_bh(WRITE_SYNC, bh);
194 if (ret) {
195 unlock_buffer(bh);
196 return ret;
197 }
198 goto retry;
199 }
177 200
178 if (unlikely(!buffer_uptodate(bh))) 201 if (unlikely(!buffer_uptodate(bh)))
179 ret = -EIO; 202 ret = -EIO;
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
332 int flags; 355 int flags;
333 int err; 356 int err;
334 unsigned long long blocknr; 357 unsigned long long blocknr;
358 ktime_t start_time;
359 u64 commit_time;
335 char *tagp = NULL; 360 char *tagp = NULL;
336 journal_header_t *header; 361 journal_header_t *header;
337 journal_block_tag_t *tag = NULL; 362 journal_block_tag_t *tag = NULL;
338 int space_left = 0; 363 int space_left = 0;
339 int first_tag = 0; 364 int first_tag = 0;
340 int tag_flag; 365 int tag_flag;
341 int i; 366 int i, to_free = 0;
342 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
343 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
344 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
458 commit_transaction->t_state = T_FLUSH; 483 commit_transaction->t_state = T_FLUSH;
459 journal->j_committing_transaction = commit_transaction; 484 journal->j_committing_transaction = commit_transaction;
460 journal->j_running_transaction = NULL; 485 journal->j_running_transaction = NULL;
486 start_time = ktime_get();
461 commit_transaction->t_log_start = journal->j_head; 487 commit_transaction->t_log_start = journal->j_head;
462 wake_up(&journal->j_wait_transaction_locked); 488 wake_up(&journal->j_wait_transaction_locked);
463 spin_unlock(&journal->j_state_lock); 489 spin_unlock(&journal->j_state_lock);
@@ -509,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
509 if (is_journal_aborted(journal)) { 535 if (is_journal_aborted(journal)) {
510 clear_buffer_jbddirty(jh2bh(jh)); 536 clear_buffer_jbddirty(jh2bh(jh));
511 JBUFFER_TRACE(jh, "journal is aborting: refile"); 537 JBUFFER_TRACE(jh, "journal is aborting: refile");
538 jbd2_buffer_abort_trigger(jh,
539 jh->b_frozen_data ?
540 jh->b_frozen_triggers :
541 jh->b_triggers);
512 jbd2_journal_refile_buffer(journal, jh); 542 jbd2_journal_refile_buffer(journal, jh);
513 /* If that was the last one, we need to clean up 543 /* If that was the last one, we need to clean up
514 * any descriptor buffers which may have been 544 * any descriptor buffers which may have been
@@ -799,7 +829,7 @@ wait_for_iobuf:
799 __jbd2_journal_abort_hard(journal); 829 __jbd2_journal_abort_hard(journal);
800 } 830 }
801 if (!err && !is_journal_aborted(journal)) 831 if (!err && !is_journal_aborted(journal))
802 err = journal_wait_on_commit_record(cbh); 832 err = journal_wait_on_commit_record(journal, cbh);
803 833
804 if (err) 834 if (err)
805 jbd2_journal_abort(journal, err); 835 jbd2_journal_abort(journal, err);
@@ -844,6 +874,9 @@ restart_loop:
844 * data. 874 * data.
845 * 875 *
846 * Otherwise, we can just throw away the frozen data now. 876 * Otherwise, we can just throw away the frozen data now.
877 *
878 * We also know that the frozen data has already fired
879 * its triggers if they exist, so we can clear that too.
847 */ 880 */
848 if (jh->b_committed_data) { 881 if (jh->b_committed_data) {
849 jbd2_free(jh->b_committed_data, bh->b_size); 882 jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +884,12 @@ restart_loop:
851 if (jh->b_frozen_data) { 884 if (jh->b_frozen_data) {
852 jh->b_committed_data = jh->b_frozen_data; 885 jh->b_committed_data = jh->b_frozen_data;
853 jh->b_frozen_data = NULL; 886 jh->b_frozen_data = NULL;
887 jh->b_frozen_triggers = NULL;
854 } 888 }
855 } else if (jh->b_frozen_data) { 889 } else if (jh->b_frozen_data) {
856 jbd2_free(jh->b_frozen_data, bh->b_size); 890 jbd2_free(jh->b_frozen_data, bh->b_size);
857 jh->b_frozen_data = NULL; 891 jh->b_frozen_data = NULL;
892 jh->b_frozen_triggers = NULL;
858 } 893 }
859 894
860 spin_lock(&journal->j_list_lock); 895 spin_lock(&journal->j_list_lock);
@@ -972,14 +1007,23 @@ restart_loop:
972 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1007 J_ASSERT(commit_transaction == journal->j_committing_transaction);
973 journal->j_commit_sequence = commit_transaction->t_tid; 1008 journal->j_commit_sequence = commit_transaction->t_tid;
974 journal->j_committing_transaction = NULL; 1009 journal->j_committing_transaction = NULL;
975 spin_unlock(&journal->j_state_lock); 1010 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
976 1011
977 if (journal->j_commit_callback) 1012 /*
978 journal->j_commit_callback(journal, commit_transaction); 1013 * weight the average time higher than the commit time so we don't
1014 * react too strongly to vast changes in the commit time
1015 */
1016 if (likely(journal->j_average_commit_time))
1017 journal->j_average_commit_time = (commit_time +
1018 journal->j_average_commit_time*3) / 4;
1019 else
1020 journal->j_average_commit_time = commit_time;
1021 spin_unlock(&journal->j_state_lock);
979 1022
980 if (commit_transaction->t_checkpoint_list == NULL && 1023 if (commit_transaction->t_checkpoint_list == NULL &&
981 commit_transaction->t_checkpoint_io_list == NULL) { 1024 commit_transaction->t_checkpoint_io_list == NULL) {
982 __jbd2_journal_drop_transaction(journal, commit_transaction); 1025 __jbd2_journal_drop_transaction(journal, commit_transaction);
1026 to_free = 1;
983 } else { 1027 } else {
984 if (journal->j_checkpoint_transactions == NULL) { 1028 if (journal->j_checkpoint_transactions == NULL) {
985 journal->j_checkpoint_transactions = commit_transaction; 1029 journal->j_checkpoint_transactions = commit_transaction;
@@ -998,11 +1042,16 @@ restart_loop:
998 } 1042 }
999 spin_unlock(&journal->j_list_lock); 1043 spin_unlock(&journal->j_list_lock);
1000 1044
1045 if (journal->j_commit_callback)
1046 journal->j_commit_callback(journal, commit_transaction);
1047
1001 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 1048 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
1002 journal->j_devname, journal->j_commit_sequence, 1049 journal->j_devname, commit_transaction->t_tid,
1003 journal->j_tail_sequence); 1050 journal->j_tail_sequence);
1004 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1051 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1005 journal->j_commit_sequence, journal->j_tail_sequence); 1052 journal->j_commit_sequence, journal->j_tail_sequence);
1053 if (to_free)
1054 kfree(commit_transaction);
1006 1055
1007 wake_up(&journal->j_wait_done_commit); 1056 wake_up(&journal->j_wait_done_commit);
1008} 1057}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f8..eb343008eded 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -37,6 +37,7 @@
37#include <linux/proc_fs.h> 37#include <linux/proc_fs.h>
38#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/seq_file.h> 39#include <linux/seq_file.h>
40#include <linux/math64.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/page.h> 43#include <asm/page.h>
@@ -50,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
50EXPORT_SYMBOL(jbd2_journal_get_write_access); 51EXPORT_SYMBOL(jbd2_journal_get_write_access);
51EXPORT_SYMBOL(jbd2_journal_get_create_access); 52EXPORT_SYMBOL(jbd2_journal_get_create_access);
52EXPORT_SYMBOL(jbd2_journal_get_undo_access); 53EXPORT_SYMBOL(jbd2_journal_get_undo_access);
54EXPORT_SYMBOL(jbd2_journal_set_triggers);
53EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 55EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
54EXPORT_SYMBOL(jbd2_journal_release_buffer); 56EXPORT_SYMBOL(jbd2_journal_release_buffer);
55EXPORT_SYMBOL(jbd2_journal_forget); 57EXPORT_SYMBOL(jbd2_journal_forget);
@@ -65,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
65EXPORT_SYMBOL(jbd2_journal_check_used_features); 67EXPORT_SYMBOL(jbd2_journal_check_used_features);
66EXPORT_SYMBOL(jbd2_journal_check_available_features); 68EXPORT_SYMBOL(jbd2_journal_check_available_features);
67EXPORT_SYMBOL(jbd2_journal_set_features); 69EXPORT_SYMBOL(jbd2_journal_set_features);
68EXPORT_SYMBOL(jbd2_journal_create);
69EXPORT_SYMBOL(jbd2_journal_load); 70EXPORT_SYMBOL(jbd2_journal_load);
70EXPORT_SYMBOL(jbd2_journal_destroy); 71EXPORT_SYMBOL(jbd2_journal_destroy);
71EXPORT_SYMBOL(jbd2_journal_abort); 72EXPORT_SYMBOL(jbd2_journal_abort);
@@ -131,8 +132,9 @@ static int kjournald2(void *arg)
131 journal->j_task = current; 132 journal->j_task = current;
132 wake_up(&journal->j_wait_done_commit); 133 wake_up(&journal->j_wait_done_commit);
133 134
134 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", 135 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
135 journal->j_commit_interval / HZ); 136 "commit interval %ld seconds\n", current->pid,
137 journal->j_devname, journal->j_commit_interval / HZ);
136 138
137 /* 139 /*
138 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
@@ -290,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
290 struct page *new_page; 292 struct page *new_page;
291 unsigned int new_offset; 293 unsigned int new_offset;
292 struct buffer_head *bh_in = jh2bh(jh_in); 294 struct buffer_head *bh_in = jh2bh(jh_in);
295 struct jbd2_buffer_trigger_type *triggers;
293 296
294 /* 297 /*
295 * The buffer really shouldn't be locked: only the current committing 298 * The buffer really shouldn't be locked: only the current committing
@@ -314,13 +317,23 @@ repeat:
314 done_copy_out = 1; 317 done_copy_out = 1;
315 new_page = virt_to_page(jh_in->b_frozen_data); 318 new_page = virt_to_page(jh_in->b_frozen_data);
316 new_offset = offset_in_page(jh_in->b_frozen_data); 319 new_offset = offset_in_page(jh_in->b_frozen_data);
320 triggers = jh_in->b_frozen_triggers;
317 } else { 321 } else {
318 new_page = jh2bh(jh_in)->b_page; 322 new_page = jh2bh(jh_in)->b_page;
319 new_offset = offset_in_page(jh2bh(jh_in)->b_data); 323 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
324 triggers = jh_in->b_triggers;
320 } 325 }
321 326
322 mapped_data = kmap_atomic(new_page, KM_USER0); 327 mapped_data = kmap_atomic(new_page, KM_USER0);
323 /* 328 /*
329 * Fire any commit trigger. Do this before checking for escaping,
330 * as the trigger may modify the magic offset. If a copy-out
331 * happens afterwards, it will have the correct data in the buffer.
332 */
333 jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
334 triggers);
335
336 /*
324 * Check for escaping 337 * Check for escaping
325 */ 338 */
326 if (*((__be32 *)(mapped_data + new_offset)) == 339 if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -352,6 +365,13 @@ repeat:
352 new_page = virt_to_page(tmp); 365 new_page = virt_to_page(tmp);
353 new_offset = offset_in_page(tmp); 366 new_offset = offset_in_page(tmp);
354 done_copy_out = 1; 367 done_copy_out = 1;
368
369 /*
370 * This isn't strictly necessary, as we're using frozen
371 * data for the escaping, but it keeps consistency with
372 * b_frozen_data usage.
373 */
374 jh_in->b_frozen_triggers = jh_in->b_triggers;
355 } 375 }
356 376
357 /* 377 /*
@@ -631,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
631 return NULL; 651 return NULL;
632 652
633 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 653 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
654 if (!bh)
655 return NULL;
634 lock_buffer(bh); 656 lock_buffer(bh);
635 memset(bh->b_data, 0, journal->j_blocksize); 657 memset(bh->b_data, 0, journal->j_blocksize);
636 set_buffer_uptodate(bh); 658 set_buffer_uptodate(bh);
@@ -824,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
824 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 846 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
825 seq_printf(seq, " %ums logging transaction\n", 847 seq_printf(seq, " %ums logging transaction\n",
826 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 848 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
849 seq_printf(seq, " %lluus average transaction commit time\n",
850 div_u64(s->journal->j_average_commit_time, 1000));
827 seq_printf(seq, " %lu handles per transaction\n", 851 seq_printf(seq, " %lu handles per transaction\n",
828 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 852 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
829 seq_printf(seq, " %lu blocks per transaction\n", 853 seq_printf(seq, " %lu blocks per transaction\n",
@@ -961,6 +985,8 @@ static journal_t * journal_init_common (void)
961 spin_lock_init(&journal->j_state_lock); 985 spin_lock_init(&journal->j_state_lock);
962 986
963 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 987 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
988 journal->j_min_batch_time = 0;
989 journal->j_max_batch_time = 15000; /* 15ms */
964 990
965 /* The journal is marked for error until we succeed with recovery! */ 991 /* The journal is marked for error until we succeed with recovery! */
966 journal->j_flags = JBD2_ABORT; 992 journal->j_flags = JBD2_ABORT;
@@ -1016,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1016 1042
1017 /* journal descriptor can store up to n blocks -bzzz */ 1043 /* journal descriptor can store up to n blocks -bzzz */
1018 journal->j_blocksize = blocksize; 1044 journal->j_blocksize = blocksize;
1045 jbd2_stats_proc_init(journal);
1019 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1046 n = journal->j_blocksize / sizeof(journal_block_tag_t);
1020 journal->j_wbufsize = n; 1047 journal->j_wbufsize = n;
1021 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 1048 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
1022 if (!journal->j_wbuf) { 1049 if (!journal->j_wbuf) {
1023 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1050 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1024 __func__); 1051 __func__);
1025 kfree(journal); 1052 goto out_err;
1026 journal = NULL;
1027 goto out;
1028 } 1053 }
1029 journal->j_dev = bdev; 1054 journal->j_dev = bdev;
1030 journal->j_fs_dev = fs_dev; 1055 journal->j_fs_dev = fs_dev;
@@ -1034,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1034 p = journal->j_devname; 1059 p = journal->j_devname;
1035 while ((p = strchr(p, '/'))) 1060 while ((p = strchr(p, '/')))
1036 *p = '!'; 1061 *p = '!';
1037 jbd2_stats_proc_init(journal);
1038 1062
1039 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1063 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
1040 J_ASSERT(bh != NULL); 1064 if (!bh) {
1065 printk(KERN_ERR
1066 "%s: Cannot get buffer for journal superblock\n",
1067 __func__);
1068 goto out_err;
1069 }
1041 journal->j_sb_buffer = bh; 1070 journal->j_sb_buffer = bh;
1042 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1071 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1043out: 1072
1044 return journal; 1073 return journal;
1074out_err:
1075 jbd2_stats_proc_exit(journal);
1076 kfree(journal);
1077 return NULL;
1045} 1078}
1046 1079
1047/** 1080/**
@@ -1089,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1089 if (!journal->j_wbuf) { 1122 if (!journal->j_wbuf) {
1090 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1123 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1091 __func__); 1124 __func__);
1092 jbd2_stats_proc_exit(journal); 1125 goto out_err;
1093 kfree(journal);
1094 return NULL;
1095 } 1126 }
1096 1127
1097 err = jbd2_journal_bmap(journal, 0, &blocknr); 1128 err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1099,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1099 if (err) { 1130 if (err) {
1100 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 1131 printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
1101 __func__); 1132 __func__);
1102 jbd2_stats_proc_exit(journal); 1133 goto out_err;
1103 kfree(journal);
1104 return NULL;
1105 } 1134 }
1106 1135
1107 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 1136 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1108 J_ASSERT(bh != NULL); 1137 if (!bh) {
1138 printk(KERN_ERR
1139 "%s: Cannot get buffer for journal superblock\n",
1140 __func__);
1141 goto out_err;
1142 }
1109 journal->j_sb_buffer = bh; 1143 journal->j_sb_buffer = bh;
1110 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1144 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1111 1145
1112 return journal; 1146 return journal;
1147out_err:
1148 jbd2_stats_proc_exit(journal);
1149 kfree(journal);
1150 return NULL;
1113} 1151}
1114 1152
1115/* 1153/*
@@ -1158,77 +1196,6 @@ static int journal_reset(journal_t *journal)
1158} 1196}
1159 1197
1160/** 1198/**
1161 * int jbd2_journal_create() - Initialise the new journal file
1162 * @journal: Journal to create. This structure must have been initialised
1163 *
1164 * Given a journal_t structure which tells us which disk blocks we can
1165 * use, create a new journal superblock and initialise all of the
1166 * journal fields from scratch.
1167 **/
1168int jbd2_journal_create(journal_t *journal)
1169{
1170 unsigned long long blocknr;
1171 struct buffer_head *bh;
1172 journal_superblock_t *sb;
1173 int i, err;
1174
1175 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
1176 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
1177 journal->j_maxlen);
1178 journal_fail_superblock(journal);
1179 return -EINVAL;
1180 }
1181
1182 if (journal->j_inode == NULL) {
1183 /*
1184 * We don't know what block to start at!
1185 */
1186 printk(KERN_EMERG
1187 "%s: creation of journal on external device!\n",
1188 __func__);
1189 BUG();
1190 }
1191
1192 /* Zero out the entire journal on disk. We cannot afford to
1193 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
1194 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
1195 for (i = 0; i < journal->j_maxlen; i++) {
1196 err = jbd2_journal_bmap(journal, i, &blocknr);
1197 if (err)
1198 return err;
1199 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1200 lock_buffer(bh);
1201 memset (bh->b_data, 0, journal->j_blocksize);
1202 BUFFER_TRACE(bh, "marking dirty");
1203 mark_buffer_dirty(bh);
1204 BUFFER_TRACE(bh, "marking uptodate");
1205 set_buffer_uptodate(bh);
1206 unlock_buffer(bh);
1207 __brelse(bh);
1208 }
1209
1210 sync_blockdev(journal->j_dev);
1211 jbd_debug(1, "JBD: journal cleared.\n");
1212
1213 /* OK, fill in the initial static fields in the new superblock */
1214 sb = journal->j_superblock;
1215
1216 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
1217 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1218
1219 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
1220 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
1221 sb->s_first = cpu_to_be32(1);
1222
1223 journal->j_transaction_sequence = 1;
1224
1225 journal->j_flags &= ~JBD2_ABORT;
1226 journal->j_format_version = 2;
1227
1228 return journal_reset(journal);
1229}
1230
1231/**
1232 * void jbd2_journal_update_superblock() - Update journal sb on disk. 1199 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1233 * @journal: The journal to update. 1200 * @journal: The journal to update.
1234 * @wait: Set to '0' if you don't want to wait for IO completion. 1201 * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1472,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
1472 spin_lock(&journal->j_list_lock); 1439 spin_lock(&journal->j_list_lock);
1473 while (journal->j_checkpoint_transactions != NULL) { 1440 while (journal->j_checkpoint_transactions != NULL) {
1474 spin_unlock(&journal->j_list_lock); 1441 spin_unlock(&journal->j_list_lock);
1442 mutex_lock(&journal->j_checkpoint_mutex);
1475 jbd2_log_do_checkpoint(journal); 1443 jbd2_log_do_checkpoint(journal);
1444 mutex_unlock(&journal->j_checkpoint_mutex);
1476 spin_lock(&journal->j_list_lock); 1445 spin_lock(&journal->j_list_lock);
1477 } 1446 }
1478 1447
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599a..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{ 49{
49 transaction->t_journal = journal; 50 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING; 51 transaction->t_state = T_RUNNING;
52 transaction->t_start_time = ktime_get();
51 transaction->t_tid = journal->j_transaction_sequence++; 53 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval; 54 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock); 55 spin_lock_init(&transaction->t_handle_lock);
@@ -741,6 +743,12 @@ done:
741 source = kmap_atomic(page, KM_USER0); 743 source = kmap_atomic(page, KM_USER0);
742 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 744 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
743 kunmap_atomic(source, KM_USER0); 745 kunmap_atomic(source, KM_USER0);
746
747 /*
748 * Now that the frozen data is saved off, we need to store
749 * any matching triggers.
750 */
751 jh->b_frozen_triggers = jh->b_triggers;
744 } 752 }
745 jbd_unlock_bh_state(bh); 753 jbd_unlock_bh_state(bh);
746 754
@@ -944,6 +952,47 @@ out:
944} 952}
945 953
946/** 954/**
955 * void jbd2_journal_set_triggers() - Add triggers for commit writeout
956 * @bh: buffer to trigger on
957 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
958 *
959 * Set any triggers on this journal_head. This is always safe, because
960 * triggers for a committing buffer will be saved off, and triggers for
961 * a running transaction will match the buffer in that transaction.
962 *
963 * Call with NULL to clear the triggers.
964 */
965void jbd2_journal_set_triggers(struct buffer_head *bh,
966 struct jbd2_buffer_trigger_type *type)
967{
968 struct journal_head *jh = bh2jh(bh);
969
970 jh->b_triggers = type;
971}
972
973void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
974 struct jbd2_buffer_trigger_type *triggers)
975{
976 struct buffer_head *bh = jh2bh(jh);
977
978 if (!triggers || !triggers->t_commit)
979 return;
980
981 triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
982}
983
984void jbd2_buffer_abort_trigger(struct journal_head *jh,
985 struct jbd2_buffer_trigger_type *triggers)
986{
987 if (!triggers || !triggers->t_abort)
988 return;
989
990 triggers->t_abort(triggers, jh2bh(jh));
991}
992
993
994
995/**
947 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata 996 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
948 * @handle: transaction to add buffer to. 997 * @handle: transaction to add buffer to.
949 * @bh: buffer to mark 998 * @bh: buffer to mark
@@ -1193,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
1193{ 1242{
1194 transaction_t *transaction = handle->h_transaction; 1243 transaction_t *transaction = handle->h_transaction;
1195 journal_t *journal = transaction->t_journal; 1244 journal_t *journal = transaction->t_journal;
1196 int old_handle_count, err; 1245 int err;
1197 pid_t pid; 1246 pid_t pid;
1198 1247
1199 J_ASSERT(journal_current_handle() == handle); 1248 J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
1216 /* 1265 /*
1217 * Implement synchronous transaction batching. If the handle 1266 * Implement synchronous transaction batching. If the handle
1218 * was synchronous, don't force a commit immediately. Let's 1267 * was synchronous, don't force a commit immediately. Let's
1219 * yield and let another thread piggyback onto this transaction. 1268 * yield and let another thread piggyback onto this
1220 * Keep doing that while new threads continue to arrive. 1269 * transaction. Keep doing that while new threads continue to
1221 * It doesn't cost much - we're about to run a commit and sleep 1270 * arrive. It doesn't cost much - we're about to run a commit
1222 * on IO anyway. Speeds up many-threaded, many-dir operations 1271 * and sleep on IO anyway. Speeds up many-threaded, many-dir
1223 * by 30x or more... 1272 * operations by 30x or more...
1224 * 1273 *
1225 * But don't do this if this process was the most recent one to 1274 * We try and optimize the sleep time against what the
1226 * perform a synchronous write. We do this to detect the case where a 1275 * underlying disk can do, instead of having a static sleep
1227 * single process is doing a stream of sync writes. No point in waiting 1276 * time. This is useful for the case where our storage is so
1228 * for joiners in that case. 1277 * fast that it is more optimal to go ahead and force a flush
1278 * and wait for the transaction to be committed than it is to
1279 * wait for an arbitrary amount of time for new writers to
1280 * join the transaction. We achieve this by measuring how
1281 * long it takes to commit a transaction, and compare it with
1282 * how long this transaction has been running, and if run time
1283 * < commit time then we sleep for the delta and commit. This
1284 * greatly helps super fast disks that would see slowdowns as
1285 * more threads started doing fsyncs.
1286 *
1287 * But don't do this if this process was the most recent one
1288 * to perform a synchronous write. We do this to detect the
1289 * case where a single process is doing a stream of sync
1290 * writes. No point in waiting for joiners in that case.
1229 */ 1291 */
1230 pid = current->pid; 1292 pid = current->pid;
1231 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1293 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1294 u64 commit_time, trans_time;
1295
1232 journal->j_last_sync_writer = pid; 1296 journal->j_last_sync_writer = pid;
1233 do { 1297
1234 old_handle_count = transaction->t_handle_count; 1298 spin_lock(&journal->j_state_lock);
1235 schedule_timeout_uninterruptible(1); 1299 commit_time = journal->j_average_commit_time;
1236 } while (old_handle_count != transaction->t_handle_count); 1300 spin_unlock(&journal->j_state_lock);
1301
1302 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1303 transaction->t_start_time));
1304
1305 commit_time = max_t(u64, commit_time,
1306 1000*journal->j_min_batch_time);
1307 commit_time = min_t(u64, commit_time,
1308 1000*journal->j_max_batch_time);
1309
1310 if (trans_time < commit_time) {
1311 ktime_t expires = ktime_add_ns(ktime_get(),
1312 commit_time);
1313 set_current_state(TASK_UNINTERRUPTIBLE);
1314 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1315 }
1237 } 1316 }
1238 1317
1239 current->journal_info = NULL; 1318 current->journal_info = NULL;
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index c73fa89b5f8a..170d289ac785 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -22,9 +22,7 @@
22 22
23 23
24#define BIT_DIVIDER_MIPS 1043 24#define BIT_DIVIDER_MIPS 1043
25static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */ 25static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241};
26
27#include <linux/errno.h>
28 26
29struct pushpull { 27struct pushpull {
30 unsigned char *buf; 28 unsigned char *buf;
@@ -43,7 +41,9 @@ struct rubin_state {
43 int bits[8]; 41 int bits[8];
44}; 42};
45 43
46static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve) 44static inline void init_pushpull(struct pushpull *pp, char *buf,
45 unsigned buflen, unsigned ofs,
46 unsigned reserve)
47{ 47{
48 pp->buf = buf; 48 pp->buf = buf;
49 pp->buflen = buflen; 49 pp->buflen = buflen;
@@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen
53 53
54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) 54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
55{ 55{
56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) { 56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve))
57 return -ENOSPC; 57 return -ENOSPC;
58 }
59 58
60 if (bit) { 59 if (bit)
61 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7))); 60 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7)));
62 } 61 else
63 else { 62 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7)));
64 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7))); 63
65 }
66 pp->ofs++; 64 pp->ofs++;
67 65
68 return 0; 66 return 0;
@@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits)
97 rs->p = (long) (2 * UPPER_BIT_RUBIN); 95 rs->p = (long) (2 * UPPER_BIT_RUBIN);
98 rs->bit_number = (long) 0; 96 rs->bit_number = (long) 0;
99 rs->bit_divider = div; 97 rs->bit_divider = div;
98
100 for (c=0; c<8; c++) 99 for (c=0; c<8; c++)
101 rs->bits[c] = bits[c]; 100 rs->bits[c] = bits[c];
102} 101}
@@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
108 long i0, i1; 107 long i0, i1;
109 int ret; 108 int ret;
110 109
111 while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { 110 while ((rs->q >= UPPER_BIT_RUBIN) ||
111 ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
112 rs->bit_number++; 112 rs->bit_number++;
113 113
114 ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); 114 ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0);
@@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
119 rs->p <<= 1; 119 rs->p <<= 1;
120 } 120 }
121 i0 = A * rs->p / (A + B); 121 i0 = A * rs->p / (A + B);
122 if (i0 <= 0) { 122 if (i0 <= 0)
123 i0 = 1; 123 i0 = 1;
124 } 124
125 if (i0 >= rs->p) { 125 if (i0 >= rs->p)
126 i0 = rs->p - 1; 126 i0 = rs->p - 1;
127 } 127
128 i1 = rs->p - i0; 128 i1 = rs->p - i0;
129 129
130 if (symbol == 0) 130 if (symbol == 0)
@@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits)
157 /* behalve lower */ 157 /* behalve lower */
158 rs->rec_q = 0; 158 rs->rec_q = 0;
159 159
160 for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) 160 for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE;
161 rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
161 ; 162 ;
162} 163}
163 164
164static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q) 165static void __do_decode(struct rubin_state *rs, unsigned long p,
166 unsigned long q)
165{ 167{
166 register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; 168 register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN;
167 unsigned long rec_q; 169 unsigned long rec_q;
@@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B)
207 __do_decode(rs, p, q); 209 __do_decode(rs, p, q);
208 210
209 i0 = A * rs->p / (A + B); 211 i0 = A * rs->p / (A + B);
210 if (i0 <= 0) { 212 if (i0 <= 0)
211 i0 = 1; 213 i0 = 1;
212 } 214
213 if (i0 >= rs->p) { 215 if (i0 >= rs->p)
214 i0 = rs->p - 1; 216 i0 = rs->p - 1;
215 }
216 217
217 threshold = rs->q + i0; 218 threshold = rs->q + i0;
218 symbol = rs->rec_q >= threshold; 219 symbol = rs->rec_q >= threshold;
@@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte)
234 struct rubin_state rs_copy; 235 struct rubin_state rs_copy;
235 rs_copy = *rs; 236 rs_copy = *rs;
236 237
237 for (i=0;i<8;i++) { 238 for (i=0; i<8; i++) {
238 ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1); 239 ret = encode(rs, rs->bit_divider-rs->bits[i],
240 rs->bits[i], byte & 1);
239 if (ret) { 241 if (ret) {
240 /* Failed. Restore old state */ 242 /* Failed. Restore old state */
241 *rs = rs_copy; 243 *rs = rs_copy;
242 return ret; 244 return ret;
243 } 245 }
244 byte=byte>>1; 246 byte >>= 1 ;
245 } 247 }
246 return 0; 248 return 0;
247} 249}
@@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs)
251 int i, result = 0, bit_divider = rs->bit_divider; 253 int i, result = 0, bit_divider = rs->bit_divider;
252 254
253 for (i = 0; i < 8; i++) 255 for (i = 0; i < 8; i++)
254 result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i; 256 result |= decode(rs, bit_divider - rs->bits[i],
257 rs->bits[i]) << i;
255 258
256 return result; 259 return result;
257} 260}
@@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs)
259 262
260 263
261static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, 264static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
262 unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) 265 unsigned char *cpage_out, uint32_t *sourcelen,
266 uint32_t *dstlen)
263 { 267 {
264 int outpos = 0; 268 int outpos = 0;
265 int pos=0; 269 int pos=0;
@@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
295int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, 299int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
296 uint32_t *sourcelen, uint32_t *dstlen, void *model) 300 uint32_t *sourcelen, uint32_t *dstlen, void *model)
297{ 301{
298 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); 302 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
303 cpage_out, sourcelen, dstlen);
299} 304}
300#endif 305#endif
301static int jffs2_dynrubin_compress(unsigned char *data_in, 306static int jffs2_dynrubin_compress(unsigned char *data_in,
@@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
316 return -1; 321 return -1;
317 322
318 memset(histo, 0, 256); 323 memset(histo, 0, 256);
319 for (i=0; i<mysrclen; i++) { 324 for (i=0; i<mysrclen; i++)
320 histo[data_in[i]]++; 325 histo[data_in[i]]++;
321 }
322 memset(bits, 0, sizeof(int)*8); 326 memset(bits, 0, sizeof(int)*8);
323 for (i=0; i<256; i++) { 327 for (i=0; i<256; i++) {
324 if (i&128) 328 if (i&128)
@@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
346 cpage_out[i] = bits[i]; 350 cpage_out[i] = bits[i];
347 } 351 }
348 352
349 ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen); 353 ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen,
354 &mydstlen);
350 if (ret) 355 if (ret)
351 return ret; 356 return ret;
352 357
@@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
363 return 0; 368 return 0;
364} 369}
365 370
366static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in, 371static void rubin_do_decompress(int bit_divider, int *bits,
367 unsigned char *page_out, uint32_t srclen, uint32_t destlen) 372 unsigned char *cdata_in,
373 unsigned char *page_out, uint32_t srclen,
374 uint32_t destlen)
368{ 375{
369 int outpos = 0; 376 int outpos = 0;
370 struct rubin_state rs; 377 struct rubin_state rs;
@@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata
372 init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); 379 init_pushpull(&rs.pp, cdata_in, srclen, 0, 0);
373 init_decode(&rs, bit_divider, bits); 380 init_decode(&rs, bit_divider, bits);
374 381
375 while (outpos < destlen) { 382 while (outpos < destlen)
376 page_out[outpos++] = in_byte(&rs); 383 page_out[outpos++] = in_byte(&rs);
377 }
378} 384}
379 385
380 386
@@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
383 uint32_t sourcelen, uint32_t dstlen, 389 uint32_t sourcelen, uint32_t dstlen,
384 void *model) 390 void *model)
385{ 391{
386 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); 392 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
393 cpage_out, sourcelen, dstlen);
387 return 0; 394 return 0;
388} 395}
389 396
@@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in,
398 for (c=0; c<8; c++) 405 for (c=0; c<8; c++)
399 bits[c] = data_in[c]; 406 bits[c] = data_in[c];
400 407
401 rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen); 408 rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8,
409 dstlen);
402 return 0; 410 return 0;
403} 411}
404 412
405static struct jffs2_compressor jffs2_rubinmips_comp = { 413static struct jffs2_compressor jffs2_rubinmips_comp = {
406 .priority = JFFS2_RUBINMIPS_PRIORITY, 414 .priority = JFFS2_RUBINMIPS_PRIORITY,
407 .name = "rubinmips", 415 .name = "rubinmips",
408 .compr = JFFS2_COMPR_DYNRUBIN, 416 .compr = JFFS2_COMPR_DYNRUBIN,
409 .compress = NULL, /*&jffs2_rubinmips_compress,*/ 417 .compress = NULL, /*&jffs2_rubinmips_compress,*/
410 .decompress = &jffs2_rubinmips_decompress, 418 .decompress = &jffs2_rubinmips_decompress,
411#ifdef JFFS2_RUBINMIPS_DISABLED 419#ifdef JFFS2_RUBINMIPS_DISABLED
412 .disabled = 1, 420 .disabled = 1,
413#else 421#else
414 .disabled = 0, 422 .disabled = 0,
415#endif 423#endif
416}; 424};
417 425
418int jffs2_rubinmips_init(void) 426int jffs2_rubinmips_init(void)
419{ 427{
420 return jffs2_register_compressor(&jffs2_rubinmips_comp); 428 return jffs2_register_compressor(&jffs2_rubinmips_comp);
421} 429}
422 430
423void jffs2_rubinmips_exit(void) 431void jffs2_rubinmips_exit(void)
424{ 432{
425 jffs2_unregister_compressor(&jffs2_rubinmips_comp); 433 jffs2_unregister_compressor(&jffs2_rubinmips_comp);
426} 434}
427 435
428static struct jffs2_compressor jffs2_dynrubin_comp = { 436static struct jffs2_compressor jffs2_dynrubin_comp = {
429 .priority = JFFS2_DYNRUBIN_PRIORITY, 437 .priority = JFFS2_DYNRUBIN_PRIORITY,
430 .name = "dynrubin", 438 .name = "dynrubin",
431 .compr = JFFS2_COMPR_RUBINMIPS, 439 .compr = JFFS2_COMPR_RUBINMIPS,
432 .compress = jffs2_dynrubin_compress, 440 .compress = jffs2_dynrubin_compress,
433 .decompress = &jffs2_dynrubin_decompress, 441 .decompress = &jffs2_dynrubin_decompress,
434#ifdef JFFS2_DYNRUBIN_DISABLED 442#ifdef JFFS2_DYNRUBIN_DISABLED
435 .disabled = 1, 443 .disabled = 1,
436#else 444#else
437 .disabled = 0, 445 .disabled = 0,
438#endif 446#endif
439}; 447};
440 448
441int jffs2_dynrubin_init(void) 449int jffs2_dynrubin_init(void)
442{ 450{
443 return jffs2_register_compressor(&jffs2_dynrubin_comp); 451 return jffs2_register_compressor(&jffs2_dynrubin_comp);
444} 452}
445 453
446void jffs2_dynrubin_exit(void) 454void jffs2_dynrubin_exit(void)
447{ 455{
448 jffs2_unregister_compressor(&jffs2_dynrubin_comp); 456 jffs2_unregister_compressor(&jffs2_dynrubin_comp);
449} 457}
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 259461b910af..c32b4a1ad6cf 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
175{ 175{
176 /* For NAND, if the failure did not occur at the device level for a 176 /* For NAND, if the failure did not occur at the device level for a
177 specific physical page, don't bother updating the bad block table. */ 177 specific physical page, don't bother updating the bad block table. */
178 if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { 178 if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) {
179 /* We had a device-level failure to erase. Let's see if we've 179 /* We had a device-level failure to erase. Let's see if we've
180 failed too many times. */ 180 failed too many times. */
181 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { 181 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
@@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr)
209 struct erase_priv_struct *priv = (void *)instr->priv; 209 struct erase_priv_struct *priv = (void *)instr->priv;
210 210
211 if(instr->state != MTD_ERASE_DONE) { 211 if(instr->state != MTD_ERASE_DONE) {
212 printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); 212 printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
213 (unsigned long long)instr->addr, instr->state);
213 jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); 214 jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
214 } else { 215 } else {
215 jffs2_erase_succeeded(priv->c, priv->jeb); 216 jffs2_erase_succeeded(priv->c, priv->jeb);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5a98aa87c853..5edc2bf20581 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
132 uint32_t pageofs = index << PAGE_CACHE_SHIFT; 132 uint32_t pageofs = index << PAGE_CACHE_SHIFT;
133 int ret = 0; 133 int ret = 0;
134 134
135 pg = __grab_cache_page(mapping, index); 135 pg = grab_cache_page_write_begin(mapping, index, flags);
136 if (!pg) 136 if (!pg)
137 return -ENOMEM; 137 return -ENOMEM;
138 *pagep = pg; 138 *pagep = pg;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 1750445556c3..507ed6ec1847 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c);
366void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); 366void jffs2_free_raw_node_refs(struct jffs2_sb_info *c);
367struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); 367struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset);
368void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); 368void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete);
369struct rb_node *rb_next(struct rb_node *);
370struct rb_node *rb_prev(struct rb_node *);
371void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
372int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); 369int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
373uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); 370uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
374struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, 371struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
new file mode 100644
index 000000000000..9ff619a6f9cc
--- /dev/null
+++ b/fs/jfs/Kconfig
@@ -0,0 +1,49 @@
1config JFS_FS
2 tristate "JFS filesystem support"
3 select NLS
4 help
5 This is a port of IBM's Journaled Filesystem . More information is
6 available in the file <file:Documentation/filesystems/jfs.txt>.
7
8 If you do not intend to use the JFS filesystem, say N.
9
10config JFS_POSIX_ACL
11 bool "JFS POSIX Access Control Lists"
12 depends on JFS_FS
13 select FS_POSIX_ACL
14 help
15 Posix Access Control Lists (ACLs) support permissions for users and
16 groups beyond the owner/group/world scheme.
17
18 To learn more about Access Control Lists, visit the Posix ACLs for
19 Linux website <http://acl.bestbits.at/>.
20
21 If you don't know what Access Control Lists are, say N
22
23config JFS_SECURITY
24 bool "JFS Security Labels"
25 depends on JFS_FS
26 help
27 Security labels support alternative access control models
28 implemented by security modules like SELinux. This option
29 enables an extended attribute handler for file security
30 labels in the jfs filesystem.
31
32 If you are not using a security module that requires using
33 extended attributes for file security labels, say N.
34
35config JFS_DEBUG
36 bool "JFS debugging"
37 depends on JFS_FS
38 help
39 If you are experiencing any problems with the JFS filesystem, say
40 Y here. This will result in additional debugging messages to be
41 written to the system log. Under normal circumstances, this
42 results in very little overhead.
43
44config JFS_STATISTICS
45 bool "JFS statistics"
46 depends on JFS_FS
47 help
48 Enabling this option will cause statistics from the JFS file system
49 to be made available to the user in the /proc/fs/jfs/ directory.
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index d6363d8309d0..0f94381ca6d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -58,9 +58,9 @@
58 58
59/* 59/*
60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want 60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want
61 * special inodes in the fileset inode space, we hash them to a dummy head 61 * special inodes in the fileset inode space, we make them appear hashed,
62 * but do not put on any lists.
62 */ 63 */
63static HLIST_HEAD(aggregate_hash);
64 64
65/* 65/*
66 * imap locks 66 * imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
496 /* release the page */ 496 /* release the page */
497 release_metapage(mp); 497 release_metapage(mp);
498 498
499 hlist_add_head(&ip->i_hash, &aggregate_hash); 499 /*
500 * that will look hashed, but won't be on any list; hlist_del()
501 * will work fine and require no locking.
502 */
503 ip->i_hash.pprev = &ip->i_hash.next;
500 504
501 return (ip); 505 return (ip);
502} 506}
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 70022fd1c539..d4d142c2edd4 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
79 inode = new_inode(sb); 79 inode = new_inode(sb);
80 if (!inode) { 80 if (!inode) {
81 jfs_warn("ialloc: new_inode returned NULL!"); 81 jfs_warn("ialloc: new_inode returned NULL!");
82 return ERR_PTR(-ENOMEM); 82 rc = -ENOMEM;
83 goto fail;
83 } 84 }
84 85
85 jfs_inode = JFS_IP(inode); 86 jfs_inode = JFS_IP(inode);
@@ -89,8 +90,12 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
89 jfs_warn("ialloc: diAlloc returned %d!", rc); 90 jfs_warn("ialloc: diAlloc returned %d!", rc);
90 if (rc == -EIO) 91 if (rc == -EIO)
91 make_bad_inode(inode); 92 make_bad_inode(inode);
92 iput(inode); 93 goto fail_put;
93 return ERR_PTR(rc); 94 }
95
96 if (insert_inode_locked(inode) < 0) {
97 rc = -EINVAL;
98 goto fail_unlock;
94 } 99 }
95 100
96 inode->i_uid = current_fsuid(); 101 inode->i_uid = current_fsuid();
@@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
112 * Allocate inode to quota. 117 * Allocate inode to quota.
113 */ 118 */
114 if (DQUOT_ALLOC_INODE(inode)) { 119 if (DQUOT_ALLOC_INODE(inode)) {
115 DQUOT_DROP(inode); 120 rc = -EDQUOT;
116 inode->i_flags |= S_NOQUOTA; 121 goto fail_drop;
117 inode->i_nlink = 0;
118 iput(inode);
119 return ERR_PTR(-EDQUOT);
120 } 122 }
121 123
122 inode->i_mode = mode; 124 inode->i_mode = mode;
@@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
158 jfs_info("ialloc returns inode = 0x%p\n", inode); 160 jfs_info("ialloc returns inode = 0x%p\n", inode);
159 161
160 return inode; 162 return inode;
163
164fail_drop:
165 DQUOT_DROP(inode);
166 inode->i_flags |= S_NOQUOTA;
167fail_unlock:
168 inode->i_nlink = 0;
169 unlock_new_inode(inode);
170fail_put:
171 iput(inode);
172fail:
173 return ERR_PTR(rc);
161} 174}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index cc3cedffbfa1..b4de56b851e4 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
155 ip->i_fop = &jfs_file_operations; 155 ip->i_fop = &jfs_file_operations;
156 ip->i_mapping->a_ops = &jfs_aops; 156 ip->i_mapping->a_ops = &jfs_aops;
157 157
158 insert_inode_hash(ip);
159 mark_inode_dirty(ip); 158 mark_inode_dirty(ip);
160 159
161 dip->i_ctime = dip->i_mtime = CURRENT_TIME; 160 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
171 if (rc) { 170 if (rc) {
172 free_ea_wmap(ip); 171 free_ea_wmap(ip);
173 ip->i_nlink = 0; 172 ip->i_nlink = 0;
173 unlock_new_inode(ip);
174 iput(ip); 174 iput(ip);
175 } else 175 } else {
176 d_instantiate(dentry, ip); 176 d_instantiate(dentry, ip);
177 unlock_new_inode(ip);
178 }
177 179
178 out2: 180 out2:
179 free_UCSname(&dname); 181 free_UCSname(&dname);
@@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
289 ip->i_op = &jfs_dir_inode_operations; 291 ip->i_op = &jfs_dir_inode_operations;
290 ip->i_fop = &jfs_dir_operations; 292 ip->i_fop = &jfs_dir_operations;
291 293
292 insert_inode_hash(ip);
293 mark_inode_dirty(ip); 294 mark_inode_dirty(ip);
294 295
295 /* update parent directory inode */ 296 /* update parent directory inode */
@@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
306 if (rc) { 307 if (rc) {
307 free_ea_wmap(ip); 308 free_ea_wmap(ip);
308 ip->i_nlink = 0; 309 ip->i_nlink = 0;
310 unlock_new_inode(ip);
309 iput(ip); 311 iput(ip);
310 } else 312 } else {
311 d_instantiate(dentry, ip); 313 d_instantiate(dentry, ip);
314 unlock_new_inode(ip);
315 }
312 316
313 out2: 317 out2:
314 free_UCSname(&dname); 318 free_UCSname(&dname);
@@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1019 goto out3; 1023 goto out3;
1020 } 1024 }
1021 1025
1022 insert_inode_hash(ip);
1023 mark_inode_dirty(ip); 1026 mark_inode_dirty(ip);
1024 1027
1025 dip->i_ctime = dip->i_mtime = CURRENT_TIME; 1028 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1039 if (rc) { 1042 if (rc) {
1040 free_ea_wmap(ip); 1043 free_ea_wmap(ip);
1041 ip->i_nlink = 0; 1044 ip->i_nlink = 0;
1045 unlock_new_inode(ip);
1042 iput(ip); 1046 iput(ip);
1043 } else 1047 } else {
1044 d_instantiate(dentry, ip); 1048 d_instantiate(dentry, ip);
1049 unlock_new_inode(ip);
1050 }
1045 1051
1046 out2: 1052 out2:
1047 free_UCSname(&dname); 1053 free_UCSname(&dname);
@@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1399 jfs_ip->dev = new_encode_dev(rdev); 1405 jfs_ip->dev = new_encode_dev(rdev);
1400 init_special_inode(ip, ip->i_mode, rdev); 1406 init_special_inode(ip, ip->i_mode, rdev);
1401 1407
1402 insert_inode_hash(ip);
1403 mark_inode_dirty(ip); 1408 mark_inode_dirty(ip);
1404 1409
1405 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1410 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1417 if (rc) { 1422 if (rc) {
1418 free_ea_wmap(ip); 1423 free_ea_wmap(ip);
1419 ip->i_nlink = 0; 1424 ip->i_nlink = 0;
1425 unlock_new_inode(ip);
1420 iput(ip); 1426 iput(ip);
1421 } else 1427 } else {
1422 d_instantiate(dentry, ip); 1428 d_instantiate(dentry, ip);
1429 unlock_new_inode(ip);
1430 }
1423 1431
1424 out1: 1432 out1:
1425 free_UCSname(&dname); 1433 free_UCSname(&dname);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0dae345e481b..b37d1f78b854 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -543,7 +543,7 @@ out_kfree:
543 return ret; 543 return ret;
544} 544}
545 545
546static void jfs_write_super_lockfs(struct super_block *sb) 546static int jfs_freeze(struct super_block *sb)
547{ 547{
548 struct jfs_sb_info *sbi = JFS_SBI(sb); 548 struct jfs_sb_info *sbi = JFS_SBI(sb);
549 struct jfs_log *log = sbi->log; 549 struct jfs_log *log = sbi->log;
@@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb)
553 lmLogShutdown(log); 553 lmLogShutdown(log);
554 updateSuper(sb, FM_CLEAN); 554 updateSuper(sb, FM_CLEAN);
555 } 555 }
556 return 0;
556} 557}
557 558
558static void jfs_unlockfs(struct super_block *sb) 559static int jfs_unfreeze(struct super_block *sb)
559{ 560{
560 struct jfs_sb_info *sbi = JFS_SBI(sb); 561 struct jfs_sb_info *sbi = JFS_SBI(sb);
561 struct jfs_log *log = sbi->log; 562 struct jfs_log *log = sbi->log;
@@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb)
568 else 569 else
569 txResume(sb); 570 txResume(sb);
570 } 571 }
572 return 0;
571} 573}
572 574
573static int jfs_get_sb(struct file_system_type *fs_type, 575static int jfs_get_sb(struct file_system_type *fs_type,
@@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = {
735 .delete_inode = jfs_delete_inode, 737 .delete_inode = jfs_delete_inode,
736 .put_super = jfs_put_super, 738 .put_super = jfs_put_super,
737 .sync_fs = jfs_sync_fs, 739 .sync_fs = jfs_sync_fs,
738 .write_super_lockfs = jfs_write_super_lockfs, 740 .freeze_fs = jfs_freeze,
739 .unlockfs = jfs_unlockfs, 741 .unfreeze_fs = jfs_unfreeze,
740 .statfs = jfs_statfs, 742 .statfs = jfs_statfs,
741 .remount_fs = jfs_remount, 743 .remount_fs = jfs_remount,
742 .show_options = jfs_show_options, 744 .show_options = jfs_show_options,
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a8321902..49b44099dabb 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
231 */ 231 */
232 root->i_ino = 1; 232 root->i_ino = 1;
233 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; 233 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
234 root->i_uid = root->i_gid = 0;
235 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; 234 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
236 dentry = d_alloc(NULL, &d_name); 235 dentry = d_alloc(NULL, &d_name);
237 if (!dentry) { 236 if (!dentry) {
@@ -360,7 +359,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
360 index = pos >> PAGE_CACHE_SHIFT; 359 index = pos >> PAGE_CACHE_SHIFT;
361 from = pos & (PAGE_CACHE_SIZE - 1); 360 from = pos & (PAGE_CACHE_SIZE - 1);
362 361
363 page = __grab_cache_page(mapping, index); 362 page = grab_cache_page_write_begin(mapping, index, flags);
364 if (!page) 363 if (!page)
365 return -ENOMEM; 364 return -ENOMEM;
366 365
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
436 */ 435 */
437 inode->i_ino = 1; 436 inode->i_ino = 1;
438 inode->i_mode = S_IFDIR | 0755; 437 inode->i_mode = S_IFDIR | 0755;
439 inode->i_uid = inode->i_gid = 0;
440 inode->i_blocks = 0;
441 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 438 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
442 inode->i_op = &simple_dir_inode_operations; 439 inode->i_op = &simple_dir_inode_operations;
443 inode->i_fop = &simple_dir_operations; 440 inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
464 if (!inode) 461 if (!inode)
465 goto out; 462 goto out;
466 inode->i_mode = S_IFREG | files->mode; 463 inode->i_mode = S_IFREG | files->mode;
467 inode->i_uid = inode->i_gid = 0;
468 inode->i_blocks = 0;
469 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 464 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
470 inode->i_fop = files->ops; 465 inode->i_fop = files->ops;
471 inode->i_ino = i; 466 inode->i_ino = i;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 31668b690e03..dd7957064a8c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -16,7 +16,6 @@
16#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
17#include <linux/sunrpc/svc.h> 17#include <linux/sunrpc/svc.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/sm_inter.h>
20 19
21#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
22#define NLMCLNT_GRACE_WAIT (5*HZ) 21#define NLMCLNT_GRACE_WAIT (5*HZ)
@@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
518 unsigned char fl_type; 517 unsigned char fl_type;
519 int status = -ENOLCK; 518 int status = -ENOLCK;
520 519
521 if (nsm_monitor(host) < 0) { 520 if (nsm_monitor(host) < 0)
522 printk(KERN_NOTICE "lockd: failed to monitor %s\n",
523 host->h_name);
524 goto out; 521 goto out;
525 } 522
526 fl->fl_flags |= FL_ACCESS; 523 fl->fl_flags |= FL_ACCESS;
527 status = do_vfs_lock(fl); 524 status = do_vfs_lock(fl);
528 fl->fl_flags = fl_flags; 525 fl->fl_flags = fl_flags;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index abdebf76b820..99d737bd4325 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -15,7 +15,6 @@
15#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/sm_inter.h>
19#include <linux/mutex.h> 18#include <linux/mutex.h>
20 19
21#include <net/ipv6.h> 20#include <net/ipv6.h>
@@ -32,11 +31,6 @@ static int nrhosts;
32static DEFINE_MUTEX(nlm_host_mutex); 31static DEFINE_MUTEX(nlm_host_mutex);
33 32
34static void nlm_gc_hosts(void); 33static void nlm_gc_hosts(void);
35static struct nsm_handle *nsm_find(const struct sockaddr *sap,
36 const size_t salen,
37 const char *hostname,
38 const size_t hostname_len,
39 const int create);
40 34
41struct nlm_lookup_host_info { 35struct nlm_lookup_host_info {
42 const int server; /* search for server|client */ 36 const int server; /* search for server|client */
@@ -105,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap)
105 } 99 }
106} 100}
107 101
108static void nlm_display_address(const struct sockaddr *sap,
109 char *buf, const size_t len)
110{
111 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
112 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
113
114 switch (sap->sa_family) {
115 case AF_UNSPEC:
116 snprintf(buf, len, "unspecified");
117 break;
118 case AF_INET:
119 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
120 break;
121 case AF_INET6:
122 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
123 snprintf(buf, len, "%pI4",
124 &sin6->sin6_addr.s6_addr32[3]);
125 else
126 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
127 break;
128 default:
129 snprintf(buf, len, "unsupported address family");
130 break;
131 }
132}
133
134/* 102/*
135 * Common host lookup routine for server & client 103 * Common host lookup routine for server & client
136 */ 104 */
@@ -190,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
190 atomic_inc(&nsm->sm_count); 158 atomic_inc(&nsm->sm_count);
191 else { 159 else {
192 host = NULL; 160 host = NULL;
193 nsm = nsm_find(ni->sap, ni->salen, 161 nsm = nsm_get_handle(ni->sap, ni->salen,
194 ni->hostname, ni->hostname_len, 1); 162 ni->hostname, ni->hostname_len);
195 if (!nsm) { 163 if (!nsm) {
196 dprintk("lockd: nlm_lookup_host failed; " 164 dprintk("lockd: nlm_lookup_host failed; "
197 "no nsm handle\n"); 165 "no nsm handle\n");
@@ -206,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
206 goto out; 174 goto out;
207 } 175 }
208 host->h_name = nsm->sm_name; 176 host->h_name = nsm->sm_name;
177 host->h_addrbuf = nsm->sm_addrbuf;
209 memcpy(nlm_addr(host), ni->sap, ni->salen); 178 memcpy(nlm_addr(host), ni->sap, ni->salen);
210 host->h_addrlen = ni->salen; 179 host->h_addrlen = ni->salen;
211 nlm_clear_port(nlm_addr(host)); 180 nlm_clear_port(nlm_addr(host));
@@ -232,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
232 201
233 nrhosts++; 202 nrhosts++;
234 203
235 nlm_display_address((struct sockaddr *)&host->h_addr,
236 host->h_addrbuf, sizeof(host->h_addrbuf));
237 nlm_display_address((struct sockaddr *)&host->h_srcaddr,
238 host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
239
240 dprintk("lockd: nlm_lookup_host created host %s\n", 204 dprintk("lockd: nlm_lookup_host created host %s\n",
241 host->h_name); 205 host->h_name);
242 206
@@ -256,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host)
256 BUG_ON(!list_empty(&host->h_lockowners)); 220 BUG_ON(!list_empty(&host->h_lockowners));
257 BUG_ON(atomic_read(&host->h_count)); 221 BUG_ON(atomic_read(&host->h_count));
258 222
259 /*
260 * Release NSM handle and unmonitor host.
261 */
262 nsm_unmonitor(host); 223 nsm_unmonitor(host);
224 nsm_release(host->h_nsmhandle);
263 225
264 clnt = host->h_rpcclnt; 226 clnt = host->h_rpcclnt;
265 if (clnt != NULL) 227 if (clnt != NULL)
@@ -378,8 +340,8 @@ nlm_bind_host(struct nlm_host *host)
378{ 340{
379 struct rpc_clnt *clnt; 341 struct rpc_clnt *clnt;
380 342
381 dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", 343 dprintk("lockd: nlm_bind_host %s (%s)\n",
382 host->h_name, host->h_addrbuf, host->h_srcaddrbuf); 344 host->h_name, host->h_addrbuf);
383 345
384 /* Lock host handle */ 346 /* Lock host handle */
385 mutex_lock(&host->h_mutex); 347 mutex_lock(&host->h_mutex);
@@ -481,35 +443,23 @@ void nlm_release_host(struct nlm_host *host)
481 } 443 }
482} 444}
483 445
484/* 446/**
485 * We were notified that the host indicated by address &sin 447 * nlm_host_rebooted - Release all resources held by rebooted host
486 * has rebooted. 448 * @info: pointer to decoded results of NLM_SM_NOTIFY call
487 * Release all resources held by that peer. 449 *
450 * We were notified that the specified host has rebooted. Release
451 * all resources held by that peer.
488 */ 452 */
489void nlm_host_rebooted(const struct sockaddr_in *sin, 453void nlm_host_rebooted(const struct nlm_reboot *info)
490 const char *hostname,
491 unsigned int hostname_len,
492 u32 new_state)
493{ 454{
494 struct hlist_head *chain; 455 struct hlist_head *chain;
495 struct hlist_node *pos; 456 struct hlist_node *pos;
496 struct nsm_handle *nsm; 457 struct nsm_handle *nsm;
497 struct nlm_host *host; 458 struct nlm_host *host;
498 459
499 nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), 460 nsm = nsm_reboot_lookup(info);
500 hostname, hostname_len, 0); 461 if (unlikely(nsm == NULL))
501 if (nsm == NULL) {
502 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
503 hostname_len, hostname);
504 return; 462 return;
505 }
506
507 dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
508 hostname_len, hostname, nsm->sm_addrbuf);
509
510 /* When reclaiming locks on this peer, make sure that
511 * we set up a new notification */
512 nsm->sm_monitored = 0;
513 463
514 /* Mark all hosts tied to this NSM state as having rebooted. 464 /* Mark all hosts tied to this NSM state as having rebooted.
515 * We run the loop repeatedly, because we drop the host table 465 * We run the loop repeatedly, because we drop the host table
@@ -520,8 +470,8 @@ again: mutex_lock(&nlm_host_mutex);
520 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 470 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
521 hlist_for_each_entry(host, pos, chain, h_hash) { 471 hlist_for_each_entry(host, pos, chain, h_hash) {
522 if (host->h_nsmhandle == nsm 472 if (host->h_nsmhandle == nsm
523 && host->h_nsmstate != new_state) { 473 && host->h_nsmstate != info->state) {
524 host->h_nsmstate = new_state; 474 host->h_nsmstate = info->state;
525 host->h_state++; 475 host->h_state++;
526 476
527 nlm_get_host(host); 477 nlm_get_host(host);
@@ -629,89 +579,3 @@ nlm_gc_hosts(void)
629 579
630 next_gc = jiffies + NLM_HOST_COLLECT; 580 next_gc = jiffies + NLM_HOST_COLLECT;
631} 581}
632
633
634/*
635 * Manage NSM handles
636 */
637static LIST_HEAD(nsm_handles);
638static DEFINE_SPINLOCK(nsm_lock);
639
640static struct nsm_handle *nsm_find(const struct sockaddr *sap,
641 const size_t salen,
642 const char *hostname,
643 const size_t hostname_len,
644 const int create)
645{
646 struct nsm_handle *nsm = NULL;
647 struct nsm_handle *pos;
648
649 if (!sap)
650 return NULL;
651
652 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
653 if (printk_ratelimit()) {
654 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
655 "in NFS lock request\n",
656 (int)hostname_len, hostname);
657 }
658 return NULL;
659 }
660
661retry:
662 spin_lock(&nsm_lock);
663 list_for_each_entry(pos, &nsm_handles, sm_link) {
664
665 if (hostname && nsm_use_hostnames) {
666 if (strlen(pos->sm_name) != hostname_len
667 || memcmp(pos->sm_name, hostname, hostname_len))
668 continue;
669 } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
670 continue;
671 atomic_inc(&pos->sm_count);
672 kfree(nsm);
673 nsm = pos;
674 goto found;
675 }
676 if (nsm) {
677 list_add(&nsm->sm_link, &nsm_handles);
678 goto found;
679 }
680 spin_unlock(&nsm_lock);
681
682 if (!create)
683 return NULL;
684
685 nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
686 if (nsm == NULL)
687 return NULL;
688
689 memcpy(nsm_addr(nsm), sap, salen);
690 nsm->sm_addrlen = salen;
691 nsm->sm_name = (char *) (nsm + 1);
692 memcpy(nsm->sm_name, hostname, hostname_len);
693 nsm->sm_name[hostname_len] = '\0';
694 nlm_display_address((struct sockaddr *)&nsm->sm_addr,
695 nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
696 atomic_set(&nsm->sm_count, 1);
697 goto retry;
698
699found:
700 spin_unlock(&nsm_lock);
701 return nsm;
702}
703
704/*
705 * Release an NSM handle
706 */
707void
708nsm_release(struct nsm_handle *nsm)
709{
710 if (!nsm)
711 return;
712 if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
713 list_del(&nsm->sm_link);
714 spin_unlock(&nsm_lock);
715 kfree(nsm);
716 }
717}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index ffd3461f75ef..5e2c4d5ac827 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -9,35 +9,123 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h>
13
12#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
13#include <linux/sunrpc/xprtsock.h> 15#include <linux/sunrpc/xprtsock.h>
14#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
15#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
16#include <linux/lockd/sm_inter.h>
17
18 18
19#define NLMDBG_FACILITY NLMDBG_MONITOR 19#define NLMDBG_FACILITY NLMDBG_MONITOR
20#define NSM_PROGRAM 100024
21#define NSM_VERSION 1
22
23enum {
24 NSMPROC_NULL,
25 NSMPROC_STAT,
26 NSMPROC_MON,
27 NSMPROC_UNMON,
28 NSMPROC_UNMON_ALL,
29 NSMPROC_SIMU_CRASH,
30 NSMPROC_NOTIFY,
31};
32
33struct nsm_args {
34 struct nsm_private *priv;
35 u32 prog; /* RPC callback info */
36 u32 vers;
37 u32 proc;
20 38
21#define XDR_ADDRBUF_LEN (20) 39 char *mon_name;
40};
22 41
23static struct rpc_clnt * nsm_create(void); 42struct nsm_res {
43 u32 status;
44 u32 state;
45};
24 46
25static struct rpc_program nsm_program; 47static struct rpc_program nsm_program;
48static LIST_HEAD(nsm_handles);
49static DEFINE_SPINLOCK(nsm_lock);
26 50
27/* 51/*
28 * Local NSM state 52 * Local NSM state
29 */ 53 */
30int nsm_local_state; 54int __read_mostly nsm_local_state;
55int __read_mostly nsm_use_hostnames;
31 56
32/* 57static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
33 * Common procedure for SM_MON/SM_UNMON calls 58{
34 */ 59 return (struct sockaddr *)&nsm->sm_addr;
35static int 60}
36nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) 61
62static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
63 const size_t len)
64{
65 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
66 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
67}
68
69static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
70 const size_t len)
71{
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
73
74 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
75 snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
76 else if (sin6->sin6_scope_id != 0)
77 snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
78 sin6->sin6_scope_id);
79 else
80 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
81}
82
83static void nsm_display_address(const struct sockaddr *sap,
84 char *buf, const size_t len)
85{
86 switch (sap->sa_family) {
87 case AF_INET:
88 nsm_display_ipv4_address(sap, buf, len);
89 break;
90 case AF_INET6:
91 nsm_display_ipv6_address(sap, buf, len);
92 break;
93 default:
94 snprintf(buf, len, "unsupported address family");
95 break;
96 }
97}
98
99static struct rpc_clnt *nsm_create(void)
100{
101 struct sockaddr_in sin = {
102 .sin_family = AF_INET,
103 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
104 };
105 struct rpc_create_args args = {
106 .protocol = XPRT_TRANSPORT_UDP,
107 .address = (struct sockaddr *)&sin,
108 .addrsize = sizeof(sin),
109 .servername = "rpc.statd",
110 .program = &nsm_program,
111 .version = NSM_VERSION,
112 .authflavor = RPC_AUTH_NULL,
113 };
114
115 return rpc_create(&args);
116}
117
118static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
37{ 119{
38 struct rpc_clnt *clnt; 120 struct rpc_clnt *clnt;
39 int status; 121 int status;
40 struct nsm_args args; 122 struct nsm_args args = {
123 .priv = &nsm->sm_priv,
124 .prog = NLM_PROGRAM,
125 .vers = 3,
126 .proc = NLMPROC_NSM_NOTIFY,
127 .mon_name = nsm->sm_mon_name,
128 };
41 struct rpc_message msg = { 129 struct rpc_message msg = {
42 .rpc_argp = &args, 130 .rpc_argp = &args,
43 .rpc_resp = res, 131 .rpc_resp = res,
@@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
46 clnt = nsm_create(); 134 clnt = nsm_create();
47 if (IS_ERR(clnt)) { 135 if (IS_ERR(clnt)) {
48 status = PTR_ERR(clnt); 136 status = PTR_ERR(clnt);
137 dprintk("lockd: failed to create NSM upcall transport, "
138 "status=%d\n", status);
49 goto out; 139 goto out;
50 } 140 }
51 141
52 memset(&args, 0, sizeof(args));
53 args.mon_name = nsm->sm_name;
54 args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
55 args.prog = NLM_PROGRAM;
56 args.vers = 3;
57 args.proc = NLMPROC_NSM_NOTIFY;
58 memset(res, 0, sizeof(*res)); 142 memset(res, 0, sizeof(*res));
59 143
60 msg.rpc_proc = &clnt->cl_procinfo[proc]; 144 msg.rpc_proc = &clnt->cl_procinfo[proc];
61 status = rpc_call_sync(clnt, &msg, 0); 145 status = rpc_call_sync(clnt, &msg, 0);
62 if (status < 0) 146 if (status < 0)
63 printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n", 147 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
64 status); 148 status);
65 else 149 else
66 status = 0; 150 status = 0;
67 rpc_shutdown_client(clnt); 151 rpc_shutdown_client(clnt);
@@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
69 return status; 153 return status;
70} 154}
71 155
72/* 156/**
73 * Set up monitoring of a remote host 157 * nsm_monitor - Notify a peer in case we reboot
158 * @host: pointer to nlm_host of peer to notify
159 *
160 * If this peer is not already monitored, this function sends an
161 * upcall to the local rpc.statd to record the name/address of
162 * the peer to notify in case we reboot.
163 *
164 * Returns zero if the peer is monitored by the local rpc.statd;
165 * otherwise a negative errno value is returned.
74 */ 166 */
75int 167int nsm_monitor(const struct nlm_host *host)
76nsm_monitor(struct nlm_host *host)
77{ 168{
78 struct nsm_handle *nsm = host->h_nsmhandle; 169 struct nsm_handle *nsm = host->h_nsmhandle;
79 struct nsm_res res; 170 struct nsm_res res;
80 int status; 171 int status;
81 172
82 dprintk("lockd: nsm_monitor(%s)\n", host->h_name); 173 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
83 BUG_ON(nsm == NULL);
84 174
85 if (nsm->sm_monitored) 175 if (nsm->sm_monitored)
86 return 0; 176 return 0;
87 177
88 status = nsm_mon_unmon(nsm, SM_MON, &res); 178 /*
179 * Choose whether to record the caller_name or IP address of
180 * this peer in the local rpc.statd's database.
181 */
182 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
89 183
90 if (status < 0 || res.status != 0) 184 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
91 printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); 185 if (res.status != 0)
186 status = -EIO;
187 if (status < 0)
188 printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
92 else 189 else
93 nsm->sm_monitored = 1; 190 nsm->sm_monitored = 1;
94 return status; 191 return status;
95} 192}
96 193
97/* 194/**
98 * Cease to monitor remote host 195 * nsm_unmonitor - Unregister peer notification
196 * @host: pointer to nlm_host of peer to stop monitoring
197 *
198 * If this peer is monitored, this function sends an upcall to
199 * tell the local rpc.statd not to send this peer a notification
200 * when we reboot.
99 */ 201 */
100int 202void nsm_unmonitor(const struct nlm_host *host)
101nsm_unmonitor(struct nlm_host *host)
102{ 203{
103 struct nsm_handle *nsm = host->h_nsmhandle; 204 struct nsm_handle *nsm = host->h_nsmhandle;
104 struct nsm_res res; 205 struct nsm_res res;
105 int status = 0; 206 int status;
106
107 if (nsm == NULL)
108 return 0;
109 host->h_nsmhandle = NULL;
110 207
111 if (atomic_read(&nsm->sm_count) == 1 208 if (atomic_read(&nsm->sm_count) == 1
112 && nsm->sm_monitored && !nsm->sm_sticky) { 209 && nsm->sm_monitored && !nsm->sm_sticky) {
113 dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); 210 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
114 211
115 status = nsm_mon_unmon(nsm, SM_UNMON, &res); 212 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res);
213 if (res.status != 0)
214 status = -EIO;
116 if (status < 0) 215 if (status < 0)
117 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", 216 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
118 host->h_name); 217 nsm->sm_name);
119 else 218 else
120 nsm->sm_monitored = 0; 219 nsm->sm_monitored = 0;
121 } 220 }
122 nsm_release(nsm); 221}
123 return status; 222
223static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
224 const size_t len)
225{
226 struct nsm_handle *nsm;
227
228 list_for_each_entry(nsm, &nsm_handles, sm_link)
229 if (strlen(nsm->sm_name) == len &&
230 memcmp(nsm->sm_name, hostname, len) == 0)
231 return nsm;
232 return NULL;
233}
234
235static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
236{
237 struct nsm_handle *nsm;
238
239 list_for_each_entry(nsm, &nsm_handles, sm_link)
240 if (nlm_cmp_addr(nsm_addr(nsm), sap))
241 return nsm;
242 return NULL;
243}
244
245static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
246{
247 struct nsm_handle *nsm;
248
249 list_for_each_entry(nsm, &nsm_handles, sm_link)
250 if (memcmp(nsm->sm_priv.data, priv->data,
251 sizeof(priv->data)) == 0)
252 return nsm;
253 return NULL;
124} 254}
125 255
126/* 256/*
127 * Create NSM client for the local host 257 * Construct a unique cookie to match this nsm_handle to this monitored
258 * host. It is passed to the local rpc.statd via NSMPROC_MON, and
259 * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
260 * requests.
261 *
262 * The NSM protocol requires that these cookies be unique while the
263 * system is running. We prefer a stronger requirement of making them
264 * unique across reboots. If user space bugs cause a stale cookie to
265 * be sent to the kernel, it could cause the wrong host to lose its
266 * lock state if cookies were not unique across reboots.
267 *
268 * The cookies are exposed only to local user space via loopback. They
269 * do not appear on the physical network. If we want greater security
270 * for some reason, nsm_init_private() could perform a one-way hash to
271 * obscure the contents of the cookie.
128 */ 272 */
129static struct rpc_clnt * 273static void nsm_init_private(struct nsm_handle *nsm)
130nsm_create(void)
131{ 274{
132 struct sockaddr_in sin = { 275 u64 *p = (u64 *)&nsm->sm_priv.data;
133 .sin_family = AF_INET, 276 struct timespec ts;
134 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
135 .sin_port = 0,
136 };
137 struct rpc_create_args args = {
138 .protocol = XPRT_TRANSPORT_UDP,
139 .address = (struct sockaddr *)&sin,
140 .addrsize = sizeof(sin),
141 .servername = "localhost",
142 .program = &nsm_program,
143 .version = SM_VERSION,
144 .authflavor = RPC_AUTH_NULL,
145 };
146 277
147 return rpc_create(&args); 278 ktime_get_ts(&ts);
279 *p++ = timespec_to_ns(&ts);
280 *p = (unsigned long)nsm;
281}
282
283static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
284 const size_t salen,
285 const char *hostname,
286 const size_t hostname_len)
287{
288 struct nsm_handle *new;
289
290 new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
291 if (unlikely(new == NULL))
292 return NULL;
293
294 atomic_set(&new->sm_count, 1);
295 new->sm_name = (char *)(new + 1);
296 memcpy(nsm_addr(new), sap, salen);
297 new->sm_addrlen = salen;
298 nsm_init_private(new);
299 nsm_display_address((const struct sockaddr *)&new->sm_addr,
300 new->sm_addrbuf, sizeof(new->sm_addrbuf));
301 memcpy(new->sm_name, hostname, hostname_len);
302 new->sm_name[hostname_len] = '\0';
303
304 return new;
305}
306
307/**
308 * nsm_get_handle - Find or create a cached nsm_handle
309 * @sap: pointer to socket address of handle to find
310 * @salen: length of socket address
311 * @hostname: pointer to C string containing hostname to find
312 * @hostname_len: length of C string
313 *
314 * Behavior is modulated by the global nsm_use_hostnames variable.
315 *
316 * Returns a cached nsm_handle after bumping its ref count, or
317 * returns a fresh nsm_handle if a handle that matches @sap and/or
318 * @hostname cannot be found in the handle cache. Returns NULL if
319 * an error occurs.
320 */
321struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
322 const size_t salen, const char *hostname,
323 const size_t hostname_len)
324{
325 struct nsm_handle *cached, *new = NULL;
326
327 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
328 if (printk_ratelimit()) {
329 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
330 "in NFS lock request\n",
331 (int)hostname_len, hostname);
332 }
333 return NULL;
334 }
335
336retry:
337 spin_lock(&nsm_lock);
338
339 if (nsm_use_hostnames && hostname != NULL)
340 cached = nsm_lookup_hostname(hostname, hostname_len);
341 else
342 cached = nsm_lookup_addr(sap);
343
344 if (cached != NULL) {
345 atomic_inc(&cached->sm_count);
346 spin_unlock(&nsm_lock);
347 kfree(new);
348 dprintk("lockd: found nsm_handle for %s (%s), "
349 "cnt %d\n", cached->sm_name,
350 cached->sm_addrbuf,
351 atomic_read(&cached->sm_count));
352 return cached;
353 }
354
355 if (new != NULL) {
356 list_add(&new->sm_link, &nsm_handles);
357 spin_unlock(&nsm_lock);
358 dprintk("lockd: created nsm_handle for %s (%s)\n",
359 new->sm_name, new->sm_addrbuf);
360 return new;
361 }
362
363 spin_unlock(&nsm_lock);
364
365 new = nsm_create_handle(sap, salen, hostname, hostname_len);
366 if (unlikely(new == NULL))
367 return NULL;
368 goto retry;
369}
370
371/**
372 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
373 * @info: pointer to NLMPROC_SM_NOTIFY arguments
374 *
375 * Returns a matching nsm_handle if found in the nsm cache; the returned
376 * nsm_handle's reference count is bumped and sm_monitored is cleared.
377 * Otherwise returns NULL if some error occurred.
378 */
379struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
380{
381 struct nsm_handle *cached;
382
383 spin_lock(&nsm_lock);
384
385 cached = nsm_lookup_priv(&info->priv);
386 if (unlikely(cached == NULL)) {
387 spin_unlock(&nsm_lock);
388 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
389 info->len, info->mon);
390 return cached;
391 }
392
393 atomic_inc(&cached->sm_count);
394 spin_unlock(&nsm_lock);
395
396 /*
397 * During subsequent lock activity, force a fresh
398 * notification to be set up for this host.
399 */
400 cached->sm_monitored = 0;
401
402 dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
403 cached->sm_name, cached->sm_addrbuf,
404 atomic_read(&cached->sm_count));
405 return cached;
406}
407
408/**
409 * nsm_release - Release an NSM handle
410 * @nsm: pointer to handle to be released
411 *
412 */
413void nsm_release(struct nsm_handle *nsm)
414{
415 if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
416 list_del(&nsm->sm_link);
417 spin_unlock(&nsm_lock);
418 dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
419 nsm->sm_name, nsm->sm_addrbuf);
420 kfree(nsm);
421 }
148} 422}
149 423
150/* 424/*
@@ -154,127 +428,132 @@ nsm_create(void)
154 * Status Monitor wire protocol. 428 * Status Monitor wire protocol.
155 */ 429 */
156 430
157static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) 431static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
158{ 432{
159 size_t len = strlen(string); 433 const u32 len = strlen(string);
160 434 __be32 *p;
161 if (len > SM_MAXSTRLEN) 435
162 len = SM_MAXSTRLEN; 436 if (unlikely(len > SM_MAXSTRLEN))
163 return xdr_encode_opaque(p, string, len); 437 return -EIO;
438 p = xdr_reserve_space(xdr, sizeof(u32) + len);
439 if (unlikely(p == NULL))
440 return -EIO;
441 xdr_encode_opaque(p, string, len);
442 return 0;
164} 443}
165 444
166/* 445/*
167 * "mon_name" specifies the host to be monitored. 446 * "mon_name" specifies the host to be monitored.
168 *
169 * Linux uses a text version of the IP address of the remote
170 * host as the host identifier (the "mon_name" argument).
171 *
172 * Linux statd always looks up the canonical hostname first for
173 * whatever remote hostname it receives, so this works alright.
174 */ 447 */
175static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) 448static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
176{ 449{
177 char buffer[XDR_ADDRBUF_LEN + 1]; 450 return encode_nsm_string(xdr, argp->mon_name);
178 char *name = argp->mon_name;
179
180 if (!nsm_use_hostnames) {
181 snprintf(buffer, XDR_ADDRBUF_LEN,
182 "%pI4", &argp->addr);
183 name = buffer;
184 }
185
186 return xdr_encode_nsm_string(p, name);
187} 451}
188 452
189/* 453/*
190 * The "my_id" argument specifies the hostname and RPC procedure 454 * The "my_id" argument specifies the hostname and RPC procedure
191 * to be called when the status manager receives notification 455 * to be called when the status manager receives notification
192 * (via the SM_NOTIFY call) that the state of host "mon_name" 456 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
193 * has changed. 457 * has changed.
194 */ 458 */
195static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) 459static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
196{ 460{
197 p = xdr_encode_nsm_string(p, utsname()->nodename); 461 int status;
198 if (!p) 462 __be32 *p;
199 return ERR_PTR(-EIO); 463
200 464 status = encode_nsm_string(xdr, utsname()->nodename);
465 if (unlikely(status != 0))
466 return status;
467 p = xdr_reserve_space(xdr, 3 * sizeof(u32));
468 if (unlikely(p == NULL))
469 return -EIO;
201 *p++ = htonl(argp->prog); 470 *p++ = htonl(argp->prog);
202 *p++ = htonl(argp->vers); 471 *p++ = htonl(argp->vers);
203 *p++ = htonl(argp->proc); 472 *p++ = htonl(argp->proc);
204 473 return 0;
205 return p;
206} 474}
207 475
208/* 476/*
209 * The "mon_id" argument specifies the non-private arguments 477 * The "mon_id" argument specifies the non-private arguments
210 * of an SM_MON or SM_UNMON call. 478 * of an NSMPROC_MON or NSMPROC_UNMON call.
211 */ 479 */
212static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) 480static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
213{ 481{
214 p = xdr_encode_mon_name(p, argp); 482 int status;
215 if (!p)
216 return ERR_PTR(-EIO);
217 483
218 return xdr_encode_my_id(p, argp); 484 status = encode_mon_name(xdr, argp);
485 if (unlikely(status != 0))
486 return status;
487 return encode_my_id(xdr, argp);
219} 488}
220 489
221/* 490/*
222 * The "priv" argument may contain private information required 491 * The "priv" argument may contain private information required
223 * by the SM_MON call. This information will be supplied in the 492 * by the NSMPROC_MON call. This information will be supplied in the
224 * SM_NOTIFY call. 493 * NLMPROC_SM_NOTIFY call.
225 *
226 * Linux provides the raw IP address of the monitored host,
227 * left in network byte order.
228 */ 494 */
229static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) 495static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
230{ 496{
231 *p++ = argp->addr; 497 __be32 *p;
232 *p++ = 0;
233 *p++ = 0;
234 *p++ = 0;
235 498
236 return p; 499 p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
500 if (unlikely(p == NULL))
501 return -EIO;
502 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
503 return 0;
237} 504}
238 505
239static int 506static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
240xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) 507 const struct nsm_args *argp)
241{ 508{
242 p = xdr_encode_mon_id(p, argp); 509 struct xdr_stream xdr;
243 if (IS_ERR(p)) 510 int status;
244 return PTR_ERR(p); 511
245 512 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
246 p = xdr_encode_priv(p, argp); 513 status = encode_mon_id(&xdr, argp);
247 if (IS_ERR(p)) 514 if (unlikely(status))
248 return PTR_ERR(p); 515 return status;
249 516 return encode_priv(&xdr, argp);
250 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
251 return 0;
252} 517}
253 518
254static int 519static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
255xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) 520 const struct nsm_args *argp)
256{ 521{
257 p = xdr_encode_mon_id(p, argp); 522 struct xdr_stream xdr;
258 if (IS_ERR(p)) 523
259 return PTR_ERR(p); 524 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
260 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); 525 return encode_mon_id(&xdr, argp);
261 return 0;
262} 526}
263 527
264static int 528static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
265xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) 529 struct nsm_res *resp)
266{ 530{
531 struct xdr_stream xdr;
532
533 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
534 p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
535 if (unlikely(p == NULL))
536 return -EIO;
267 resp->status = ntohl(*p++); 537 resp->status = ntohl(*p++);
268 resp->state = ntohl(*p++); 538 resp->state = ntohl(*p);
269 dprintk("nsm: xdr_decode_stat_res status %d state %d\n", 539
540 dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
270 resp->status, resp->state); 541 resp->status, resp->state);
271 return 0; 542 return 0;
272} 543}
273 544
274static int 545static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
275xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) 546 struct nsm_res *resp)
276{ 547{
277 resp->state = ntohl(*p++); 548 struct xdr_stream xdr;
549
550 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
551 p = xdr_inline_decode(&xdr, sizeof(u32));
552 if (unlikely(p == NULL))
553 return -EIO;
554 resp->state = ntohl(*p);
555
556 dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
278 return 0; 557 return 0;
279} 558}
280 559
@@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
288#define SM_unmonres_sz 1 567#define SM_unmonres_sz 1
289 568
290static struct rpc_procinfo nsm_procedures[] = { 569static struct rpc_procinfo nsm_procedures[] = {
291[SM_MON] = { 570[NSMPROC_MON] = {
292 .p_proc = SM_MON, 571 .p_proc = NSMPROC_MON,
293 .p_encode = (kxdrproc_t) xdr_encode_mon, 572 .p_encode = (kxdrproc_t)xdr_enc_mon,
294 .p_decode = (kxdrproc_t) xdr_decode_stat_res, 573 .p_decode = (kxdrproc_t)xdr_dec_stat_res,
295 .p_arglen = SM_mon_sz, 574 .p_arglen = SM_mon_sz,
296 .p_replen = SM_monres_sz, 575 .p_replen = SM_monres_sz,
297 .p_statidx = SM_MON, 576 .p_statidx = NSMPROC_MON,
298 .p_name = "MONITOR", 577 .p_name = "MONITOR",
299 }, 578 },
300[SM_UNMON] = { 579[NSMPROC_UNMON] = {
301 .p_proc = SM_UNMON, 580 .p_proc = NSMPROC_UNMON,
302 .p_encode = (kxdrproc_t) xdr_encode_unmon, 581 .p_encode = (kxdrproc_t)xdr_enc_unmon,
303 .p_decode = (kxdrproc_t) xdr_decode_stat, 582 .p_decode = (kxdrproc_t)xdr_dec_stat,
304 .p_arglen = SM_mon_id_sz, 583 .p_arglen = SM_mon_id_sz,
305 .p_replen = SM_unmonres_sz, 584 .p_replen = SM_unmonres_sz,
306 .p_statidx = SM_UNMON, 585 .p_statidx = NSMPROC_UNMON,
307 .p_name = "UNMONITOR", 586 .p_name = "UNMONITOR",
308 }, 587 },
309}; 588};
@@ -322,7 +601,7 @@ static struct rpc_stat nsm_stats;
322 601
323static struct rpc_program nsm_program = { 602static struct rpc_program nsm_program = {
324 .name = "statd", 603 .name = "statd",
325 .number = SM_PROGRAM, 604 .number = NSM_PROGRAM,
326 .nrvers = ARRAY_SIZE(nsm_version), 605 .nrvers = ARRAY_SIZE(nsm_version),
327 .version = nsm_version, 606 .version = nsm_version,
328 .stats = &nsm_stats 607 .stats = &nsm_stats
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 252d80163d02..64f1c31b5853 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,7 +35,6 @@
35#include <linux/sunrpc/svcsock.h> 35#include <linux/sunrpc/svcsock.h>
36#include <net/ip.h> 36#include <net/ip.h>
37#include <linux/lockd/lockd.h> 37#include <linux/lockd/lockd.h>
38#include <linux/lockd/sm_inter.h>
39#include <linux/nfs.h> 38#include <linux/nfs.h>
40 39
41#define NLMDBG_FACILITY NLMDBG_SVC 40#define NLMDBG_FACILITY NLMDBG_SVC
@@ -54,13 +53,26 @@ static struct svc_rqst *nlmsvc_rqst;
54unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
55 54
56/* 55/*
56 * If the kernel has IPv6 support available, always listen for
57 * both AF_INET and AF_INET6 requests.
58 */
59#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
60 defined(CONFIG_SUNRPC_REGISTER_V4)
61static const sa_family_t nlmsvc_family = AF_INET6;
62#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
63static const sa_family_t nlmsvc_family = AF_INET;
64#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
65
66/*
57 * These can be set at insmod time (useful for NFS as root filesystem), 67 * These can be set at insmod time (useful for NFS as root filesystem),
58 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 68 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
59 */ 69 */
60static unsigned long nlm_grace_period; 70static unsigned long nlm_grace_period;
61static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; 71static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
62static int nlm_udpport, nlm_tcpport; 72static int nlm_udpport, nlm_tcpport;
63int nsm_use_hostnames = 0; 73
74/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
75static unsigned int nlm_max_connections = 1024;
64 76
65/* 77/*
66 * Constants needed for the sysctl interface. 78 * Constants needed for the sysctl interface.
@@ -143,6 +155,9 @@ lockd(void *vrqstp)
143 long timeout = MAX_SCHEDULE_TIMEOUT; 155 long timeout = MAX_SCHEDULE_TIMEOUT;
144 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 156 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
145 157
158 /* update sv_maxconn if it has changed */
159 rqstp->rq_server->sv_maxconn = nlm_max_connections;
160
146 if (signalled()) { 161 if (signalled()) {
147 flush_signals(current); 162 flush_signals(current);
148 if (nlmsvc_ops) { 163 if (nlmsvc_ops) {
@@ -189,6 +204,19 @@ lockd(void *vrqstp)
189 return 0; 204 return 0;
190} 205}
191 206
207static int create_lockd_listener(struct svc_serv *serv, char *name,
208 unsigned short port)
209{
210 struct svc_xprt *xprt;
211
212 xprt = svc_find_xprt(serv, name, 0, 0);
213 if (xprt == NULL)
214 return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
215
216 svc_xprt_put(xprt);
217 return 0;
218}
219
192/* 220/*
193 * Ensure there are active UDP and TCP listeners for lockd. 221 * Ensure there are active UDP and TCP listeners for lockd.
194 * 222 *
@@ -202,29 +230,23 @@ lockd(void *vrqstp)
202static int make_socks(struct svc_serv *serv) 230static int make_socks(struct svc_serv *serv)
203{ 231{
204 static int warned; 232 static int warned;
205 struct svc_xprt *xprt; 233 int err;
206 int err = 0;
207 234
208 xprt = svc_find_xprt(serv, "udp", 0, 0); 235 err = create_lockd_listener(serv, "udp", nlm_udpport);
209 if (!xprt) 236 if (err < 0)
210 err = svc_create_xprt(serv, "udp", nlm_udpport, 237 goto out_err;
211 SVC_SOCK_DEFAULTS); 238
212 else 239 err = create_lockd_listener(serv, "tcp", nlm_tcpport);
213 svc_xprt_put(xprt); 240 if (err < 0)
214 if (err >= 0) { 241 goto out_err;
215 xprt = svc_find_xprt(serv, "tcp", 0, 0); 242
216 if (!xprt) 243 warned = 0;
217 err = svc_create_xprt(serv, "tcp", nlm_tcpport, 244 return 0;
218 SVC_SOCK_DEFAULTS); 245
219 else 246out_err:
220 svc_xprt_put(xprt); 247 if (warned++ == 0)
221 }
222 if (err >= 0) {
223 warned = 0;
224 err = 0;
225 } else if (warned++ == 0)
226 printk(KERN_WARNING 248 printk(KERN_WARNING
227 "lockd_up: makesock failed, error=%d\n", err); 249 "lockd_up: makesock failed, error=%d\n", err);
228 return err; 250 return err;
229} 251}
230 252
@@ -252,7 +274,7 @@ int lockd_up(void)
252 "lockd_up: no pid, %d users??\n", nlmsvc_users); 274 "lockd_up: no pid, %d users??\n", nlmsvc_users);
253 275
254 error = -ENOMEM; 276 error = -ENOMEM;
255 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); 277 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
256 if (!serv) { 278 if (!serv) {
257 printk(KERN_WARNING "lockd_up: create service failed\n"); 279 printk(KERN_WARNING "lockd_up: create service failed\n");
258 goto out; 280 goto out;
@@ -276,6 +298,7 @@ int lockd_up(void)
276 } 298 }
277 299
278 svc_sock_update_bufs(serv); 300 svc_sock_update_bufs(serv);
301 serv->sv_maxconn = nlm_max_connections;
279 302
280 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); 303 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
281 if (IS_ERR(nlmsvc_task)) { 304 if (IS_ERR(nlmsvc_task)) {
@@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
485module_param_call(nlm_tcpport, param_set_port, param_get_int, 508module_param_call(nlm_tcpport, param_set_port, param_get_int,
486 &nlm_tcpport, 0644); 509 &nlm_tcpport, 0644);
487module_param(nsm_use_hostnames, bool, 0644); 510module_param(nsm_use_hostnames, bool, 0644);
511module_param(nlm_max_connections, uint, 0644);
488 512
489/* 513/*
490 * Initialising and terminating the module. 514 * Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4dfdcbc6bf68..1725037374c5 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -16,8 +16,6 @@
16#include <linux/nfsd/nfsd.h> 16#include <linux/nfsd/nfsd.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/share.h> 18#include <linux/lockd/share.h>
19#include <linux/lockd/sm_inter.h>
20
21 19
22#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
23 21
@@ -419,8 +417,6 @@ static __be32
419nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 417nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
420 void *resp) 418 void *resp)
421{ 419{
422 struct sockaddr_in saddr;
423
424 dprintk("lockd: SM_NOTIFY called\n"); 420 dprintk("lockd: SM_NOTIFY called\n");
425 421
426 if (!nlm_privileged_requester(rqstp)) { 422 if (!nlm_privileged_requester(rqstp)) {
@@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
430 return rpc_system_err; 426 return rpc_system_err;
431 } 427 }
432 428
433 /* Obtain the host pointer for this NFS server and try to 429 nlm_host_rebooted(argp);
434 * reclaim all locks we hold on this server.
435 */
436 memset(&saddr, 0, sizeof(saddr));
437 saddr.sin_family = AF_INET;
438 saddr.sin_addr.s_addr = argp->addr;
439 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
440
441 return rpc_success; 430 return rpc_success;
442} 431}
443 432
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3ca89e2a9381..3688e55901fc 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -16,8 +16,6 @@
16#include <linux/nfsd/nfsd.h> 16#include <linux/nfsd/nfsd.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/share.h> 18#include <linux/lockd/share.h>
19#include <linux/lockd/sm_inter.h>
20
21 19
22#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
23 21
@@ -451,8 +449,6 @@ static __be32
451nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 449nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
452 void *resp) 450 void *resp)
453{ 451{
454 struct sockaddr_in saddr;
455
456 dprintk("lockd: SM_NOTIFY called\n"); 452 dprintk("lockd: SM_NOTIFY called\n");
457 453
458 if (!nlm_privileged_requester(rqstp)) { 454 if (!nlm_privileged_requester(rqstp)) {
@@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
462 return rpc_system_err; 458 return rpc_system_err;
463 } 459 }
464 460
465 /* Obtain the host pointer for this NFS server and try to 461 nlm_host_rebooted(argp);
466 * reclaim all locks we hold on this server.
467 */
468 memset(&saddr, 0, sizeof(saddr));
469 saddr.sin_family = AF_INET;
470 saddr.sin_addr.s_addr = argp->addr;
471 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
472
473 return rpc_success; 462 return rpc_success;
474} 463}
475 464
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 34c2766e27c7..9e4d6aab611b 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -17,7 +17,6 @@
17#include <linux/nfsd/export.h> 17#include <linux/nfsd/export.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/share.h> 19#include <linux/lockd/share.h>
20#include <linux/lockd/sm_inter.h>
21#include <linux/module.h> 20#include <linux/module.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
23 22
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 1f226290c67c..0336f2beacde 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,7 +16,6 @@
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/sunrpc/stats.h> 17#include <linux/sunrpc/stats.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/sm_inter.h>
20 19
21#define NLMDBG_FACILITY NLMDBG_XDR 20#define NLMDBG_FACILITY NLMDBG_XDR
22 21
@@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
349 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 348 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
350 return 0; 349 return 0;
351 argp->state = ntohl(*p++); 350 argp->state = ntohl(*p++);
352 /* Preserve the address in network byte order */ 351 memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
353 argp->addr = *p++; 352 p += XDR_QUADLEN(SM_PRIV_SIZE);
354 return xdr_argsize_check(rqstp, p); 353 return xdr_argsize_check(rqstp, p);
355} 354}
356 355
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 50c493a8ad8e..e1d528653192 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -17,7 +17,6 @@
17#include <linux/sunrpc/svc.h> 17#include <linux/sunrpc/svc.h>
18#include <linux/sunrpc/stats.h> 18#include <linux/sunrpc/stats.h>
19#include <linux/lockd/lockd.h> 19#include <linux/lockd/lockd.h>
20#include <linux/lockd/sm_inter.h>
21 20
22#define NLMDBG_FACILITY NLMDBG_XDR 21#define NLMDBG_FACILITY NLMDBG_XDR
23 22
@@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
356 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 355 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
357 return 0; 356 return 0;
358 argp->state = ntohl(*p++); 357 argp->state = ntohl(*p++);
359 /* Preserve the address in network byte order */ 358 memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
360 argp->addr = *p++; 359 p += XDR_QUADLEN(SM_PRIV_SIZE);
361 return xdr_argsize_check(rqstp, p); 360 return xdr_argsize_check(rqstp, p);
362} 361}
363 362
diff --git a/fs/locks.c b/fs/locks.c
index 46a2e12f7d42..ec3deea29e37 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1564,7 +1564,7 @@ EXPORT_SYMBOL(flock_lock_file_wait);
1564 * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other 1564 * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
1565 * processes read and write access respectively. 1565 * processes read and write access respectively.
1566 */ 1566 */
1567asmlinkage long sys_flock(unsigned int fd, unsigned int cmd) 1567SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
1568{ 1568{
1569 struct file *filp; 1569 struct file *filp;
1570 struct file_lock *lock; 1570 struct file_lock *lock;
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
new file mode 100644
index 000000000000..0fd7ca994264
--- /dev/null
+++ b/fs/minix/Kconfig
@@ -0,0 +1,17 @@
1config MINIX_FS
2 tristate "Minix file system support"
3 depends on BLOCK
4 help
5 Minix is a simple operating system used in many classes about OS's.
6 The minix file system (method to organize files on a hard disk
7 partition or a floppy disk) was the original file system for Linux,
8 but has been superseded by the second extended file system ext2fs.
9 You don't want to use the minix file system on your hard disk
10 because of certain built-in restrictions, but it is sometimes found
11 on older Linux floppy disks. This option will enlarge your kernel
12 by about 28 KB. If unsure, say N.
13
14 To compile this file system support as a module, choose M here: the
15 module will be called minix. Note that the file system of your root
16 partition (the one containing the directory /) cannot be compiled as
17 a module.
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index f70433816a38..d4946c4c90e2 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
280 return -EINVAL; 280 return -EINVAL;
281 281
282got_it: 282got_it:
283 pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page); 283 pos = page_offset(page) + p - (char *)page_address(page);
284 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, 284 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
285 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 285 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
286 if (err) 286 if (err)
diff --git a/fs/mpage.c b/fs/mpage.c
index 552b80b3facc..16c3ef37eae3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
241 first_hole = page_block; 241 first_hole = page_block;
242 page_block++; 242 page_block++;
243 block_in_file++; 243 block_in_file++;
244 clear_buffer_mapped(map_bh);
245 continue; 244 continue;
246 } 245 }
247 246
@@ -308,7 +307,10 @@ alloc_new:
308 goto alloc_new; 307 goto alloc_new;
309 } 308 }
310 309
311 if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) 310 relative_block = block_in_file - *first_logical_block;
311 nblocks = map_bh->b_size >> blkbits;
312 if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
313 (first_hole != blocks_per_page))
312 bio = mpage_bio_submit(READ, bio); 314 bio = mpage_bio_submit(READ, bio);
313 else 315 else
314 *last_block_in_bio = blocks[blocks_per_page - 1]; 316 *last_block_in_bio = blocks[blocks_per_page - 1];
diff --git a/fs/namei.c b/fs/namei.c
index 734f2b5591bf..199317642ad6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -227,6 +227,16 @@ int generic_permission(struct inode *inode, int mask,
227 return -EACCES; 227 return -EACCES;
228} 228}
229 229
230/**
231 * inode_permission - check for access rights to a given inode
232 * @inode: inode to check permission on
233 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
234 *
235 * Used to check for read/write/execute permissions on an inode.
236 * We use "fsuid" for this, letting us set arbitrary permissions
237 * for filesystem access without changing the "normal" uids which
238 * are used for other things.
239 */
230int inode_permission(struct inode *inode, int mask) 240int inode_permission(struct inode *inode, int mask)
231{ 241{
232 int retval; 242 int retval;
@@ -248,8 +258,7 @@ int inode_permission(struct inode *inode, int mask)
248 return -EACCES; 258 return -EACCES;
249 } 259 }
250 260
251 /* Ordinary permission routines do not understand MAY_APPEND. */ 261 if (inode->i_op->permission)
252 if (inode->i_op && inode->i_op->permission)
253 retval = inode->i_op->permission(inode, mask); 262 retval = inode->i_op->permission(inode, mask);
254 else 263 else
255 retval = generic_permission(inode, mask, NULL); 264 retval = generic_permission(inode, mask, NULL);
@@ -266,21 +275,6 @@ int inode_permission(struct inode *inode, int mask)
266} 275}
267 276
268/** 277/**
269 * vfs_permission - check for access rights to a given path
270 * @nd: lookup result that describes the path
271 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
272 *
273 * Used to check for read/write/execute permissions on a path.
274 * We use "fsuid" for this, letting us set arbitrary permissions
275 * for filesystem access without changing the "normal" uids which
276 * are used for other things.
277 */
278int vfs_permission(struct nameidata *nd, int mask)
279{
280 return inode_permission(nd->path.dentry->d_inode, mask);
281}
282
283/**
284 * file_permission - check for additional access rights to a given file 278 * file_permission - check for additional access rights to a given file
285 * @file: file to check access rights for 279 * @file: file to check access rights for
286 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 280 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
@@ -290,7 +284,7 @@ int vfs_permission(struct nameidata *nd, int mask)
290 * 284 *
291 * Note: 285 * Note:
292 * Do not use this function in new code. All access checks should 286 * Do not use this function in new code. All access checks should
293 * be done using vfs_permission(). 287 * be done using inode_permission().
294 */ 288 */
295int file_permission(struct file *file, int mask) 289int file_permission(struct file *file, int mask)
296{ 290{
@@ -439,7 +433,7 @@ static int exec_permission_lite(struct inode *inode)
439{ 433{
440 umode_t mode = inode->i_mode; 434 umode_t mode = inode->i_mode;
441 435
442 if (inode->i_op && inode->i_op->permission) 436 if (inode->i_op->permission)
443 return -EAGAIN; 437 return -EAGAIN;
444 438
445 if (current_fsuid() == inode->i_uid) 439 if (current_fsuid() == inode->i_uid)
@@ -528,18 +522,6 @@ out_unlock:
528 return result; 522 return result;
529} 523}
530 524
531/* SMP-safe */
532static __always_inline void
533walk_init_root(const char *name, struct nameidata *nd)
534{
535 struct fs_struct *fs = current->fs;
536
537 read_lock(&fs->lock);
538 nd->path = fs->root;
539 path_get(&fs->root);
540 read_unlock(&fs->lock);
541}
542
543/* 525/*
544 * Wrapper to retry pathname resolution whenever the underlying 526 * Wrapper to retry pathname resolution whenever the underlying
545 * file system returns an ESTALE. 527 * file system returns an ESTALE.
@@ -577,9 +559,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
577 goto fail; 559 goto fail;
578 560
579 if (*link == '/') { 561 if (*link == '/') {
562 struct fs_struct *fs = current->fs;
563
580 path_put(&nd->path); 564 path_put(&nd->path);
581 walk_init_root(link, nd); 565
566 read_lock(&fs->lock);
567 nd->path = fs->root;
568 path_get(&fs->root);
569 read_unlock(&fs->lock);
582 } 570 }
571
583 res = link_path_walk(link, nd); 572 res = link_path_walk(link, nd);
584 if (nd->depth || res || nd->last_type!=LAST_NORM) 573 if (nd->depth || res || nd->last_type!=LAST_NORM)
585 return res; 574 return res;
@@ -860,7 +849,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
860 nd->flags |= LOOKUP_CONTINUE; 849 nd->flags |= LOOKUP_CONTINUE;
861 err = exec_permission_lite(inode); 850 err = exec_permission_lite(inode);
862 if (err == -EAGAIN) 851 if (err == -EAGAIN)
863 err = vfs_permission(nd, MAY_EXEC); 852 err = inode_permission(nd->path.dentry->d_inode,
853 MAY_EXEC);
864 if (!err) 854 if (!err)
865 err = ima_path_check(&nd->path, MAY_EXEC); 855 err = ima_path_check(&nd->path, MAY_EXEC);
866 if (err) 856 if (err)
@@ -921,9 +911,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
921 inode = next.dentry->d_inode; 911 inode = next.dentry->d_inode;
922 if (!inode) 912 if (!inode)
923 goto out_dput; 913 goto out_dput;
924 err = -ENOTDIR;
925 if (!inode->i_op)
926 goto out_dput;
927 914
928 if (inode->i_op->follow_link) { 915 if (inode->i_op->follow_link) {
929 err = do_follow_link(&next, nd); 916 err = do_follow_link(&next, nd);
@@ -933,9 +920,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
933 inode = nd->path.dentry->d_inode; 920 inode = nd->path.dentry->d_inode;
934 if (!inode) 921 if (!inode)
935 break; 922 break;
936 err = -ENOTDIR;
937 if (!inode->i_op)
938 break;
939 } else 923 } else
940 path_to_nameidata(&next, nd); 924 path_to_nameidata(&next, nd);
941 err = -ENOTDIR; 925 err = -ENOTDIR;
@@ -974,7 +958,7 @@ last_component:
974 break; 958 break;
975 inode = next.dentry->d_inode; 959 inode = next.dentry->d_inode;
976 if ((lookup_flags & LOOKUP_FOLLOW) 960 if ((lookup_flags & LOOKUP_FOLLOW)
977 && inode && inode->i_op && inode->i_op->follow_link) { 961 && inode && inode->i_op->follow_link) {
978 err = do_follow_link(&next, nd); 962 err = do_follow_link(&next, nd);
979 if (err) 963 if (err)
980 goto return_err; 964 goto return_err;
@@ -986,7 +970,7 @@ last_component:
986 break; 970 break;
987 if (lookup_flags & LOOKUP_DIRECTORY) { 971 if (lookup_flags & LOOKUP_DIRECTORY) {
988 err = -ENOTDIR; 972 err = -ENOTDIR;
989 if (!inode->i_op || !inode->i_op->lookup) 973 if (!inode->i_op->lookup)
990 break; 974 break;
991 } 975 }
992 goto return_base; 976 goto return_base;
@@ -1482,7 +1466,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1482 if (error) 1466 if (error)
1483 return error; 1467 return error;
1484 1468
1485 if (!dir->i_op || !dir->i_op->create) 1469 if (!dir->i_op->create)
1486 return -EACCES; /* shouldn't it be ENOSYS? */ 1470 return -EACCES; /* shouldn't it be ENOSYS? */
1487 mode &= S_IALLUGO; 1471 mode &= S_IALLUGO;
1488 mode |= S_IFREG; 1472 mode |= S_IFREG;
@@ -1496,9 +1480,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1496 return error; 1480 return error;
1497} 1481}
1498 1482
1499int may_open(struct nameidata *nd, int acc_mode, int flag) 1483int may_open(struct path *path, int acc_mode, int flag)
1500{ 1484{
1501 struct dentry *dentry = nd->path.dentry; 1485 struct dentry *dentry = path->dentry;
1502 struct inode *inode = dentry->d_inode; 1486 struct inode *inode = dentry->d_inode;
1503 int error; 1487 int error;
1504 1488
@@ -1519,17 +1503,17 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1519 if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 1503 if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1520 flag &= ~O_TRUNC; 1504 flag &= ~O_TRUNC;
1521 } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { 1505 } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1522 if (nd->path.mnt->mnt_flags & MNT_NODEV) 1506 if (path->mnt->mnt_flags & MNT_NODEV)
1523 return -EACCES; 1507 return -EACCES;
1524 1508
1525 flag &= ~O_TRUNC; 1509 flag &= ~O_TRUNC;
1526 } 1510 }
1527 1511
1528 error = vfs_permission(nd, acc_mode); 1512 error = inode_permission(inode, acc_mode);
1529 if (error) 1513 if (error)
1530 return error; 1514 return error;
1531 1515
1532 error = ima_path_check(&nd->path, 1516 error = ima_path_check(path,
1533 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); 1517 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
1534 if (error) 1518 if (error)
1535 return error; 1519 return error;
@@ -1564,6 +1548,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1564 * Refuse to truncate files with mandatory locks held on them. 1548 * Refuse to truncate files with mandatory locks held on them.
1565 */ 1549 */
1566 error = locks_verify_locked(inode); 1550 error = locks_verify_locked(inode);
1551 if (!error)
1552 error = security_path_truncate(path, 0,
1553 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1567 if (!error) { 1554 if (!error) {
1568 DQUOT_INIT(inode); 1555 DQUOT_INIT(inode);
1569 1556
@@ -1594,14 +1581,18 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1594 1581
1595 if (!IS_POSIXACL(dir->d_inode)) 1582 if (!IS_POSIXACL(dir->d_inode))
1596 mode &= ~current->fs->umask; 1583 mode &= ~current->fs->umask;
1584 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1585 if (error)
1586 goto out_unlock;
1597 error = vfs_create(dir->d_inode, path->dentry, mode, nd); 1587 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
1588out_unlock:
1598 mutex_unlock(&dir->d_inode->i_mutex); 1589 mutex_unlock(&dir->d_inode->i_mutex);
1599 dput(nd->path.dentry); 1590 dput(nd->path.dentry);
1600 nd->path.dentry = path->dentry; 1591 nd->path.dentry = path->dentry;
1601 if (error) 1592 if (error)
1602 return error; 1593 return error;
1603 /* Don't check for write permission, don't truncate */ 1594 /* Don't check for write permission, don't truncate */
1604 return may_open(nd, 0, flag & ~O_TRUNC); 1595 return may_open(&nd->path, 0, flag & ~O_TRUNC);
1605} 1596}
1606 1597
1607/* 1598/*
@@ -1763,7 +1754,7 @@ do_last:
1763 error = -ENOENT; 1754 error = -ENOENT;
1764 if (!path.dentry->d_inode) 1755 if (!path.dentry->d_inode)
1765 goto exit_dput; 1756 goto exit_dput;
1766 if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) 1757 if (path.dentry->d_inode->i_op->follow_link)
1767 goto do_link; 1758 goto do_link;
1768 1759
1769 path_to_nameidata(&path, &nd); 1760 path_to_nameidata(&path, &nd);
@@ -1787,7 +1778,7 @@ ok:
1787 if (error) 1778 if (error)
1788 goto exit; 1779 goto exit;
1789 } 1780 }
1790 error = may_open(&nd, acc_mode, flag); 1781 error = may_open(&nd.path, acc_mode, flag);
1791 if (error) { 1782 if (error) {
1792 if (will_write) 1783 if (will_write)
1793 mnt_drop_write(nd.path.mnt); 1784 mnt_drop_write(nd.path.mnt);
@@ -1944,7 +1935,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1944 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 1935 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1945 return -EPERM; 1936 return -EPERM;
1946 1937
1947 if (!dir->i_op || !dir->i_op->mknod) 1938 if (!dir->i_op->mknod)
1948 return -EPERM; 1939 return -EPERM;
1949 1940
1950 error = devcgroup_inode_mknod(mode, dev); 1941 error = devcgroup_inode_mknod(mode, dev);
@@ -1979,8 +1970,8 @@ static int may_mknod(mode_t mode)
1979 } 1970 }
1980} 1971}
1981 1972
1982asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, 1973SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
1983 unsigned dev) 1974 unsigned, dev)
1984{ 1975{
1985 int error; 1976 int error;
1986 char *tmp; 1977 char *tmp;
@@ -2007,6 +1998,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
2007 error = mnt_want_write(nd.path.mnt); 1998 error = mnt_want_write(nd.path.mnt);
2008 if (error) 1999 if (error)
2009 goto out_dput; 2000 goto out_dput;
2001 error = security_path_mknod(&nd.path, dentry, mode, dev);
2002 if (error)
2003 goto out_drop_write;
2010 switch (mode & S_IFMT) { 2004 switch (mode & S_IFMT) {
2011 case 0: case S_IFREG: 2005 case 0: case S_IFREG:
2012 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); 2006 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
@@ -2019,6 +2013,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
2019 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); 2013 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
2020 break; 2014 break;
2021 } 2015 }
2016out_drop_write:
2022 mnt_drop_write(nd.path.mnt); 2017 mnt_drop_write(nd.path.mnt);
2023out_dput: 2018out_dput:
2024 dput(dentry); 2019 dput(dentry);
@@ -2030,7 +2025,7 @@ out_unlock:
2030 return error; 2025 return error;
2031} 2026}
2032 2027
2033asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev) 2028SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
2034{ 2029{
2035 return sys_mknodat(AT_FDCWD, filename, mode, dev); 2030 return sys_mknodat(AT_FDCWD, filename, mode, dev);
2036} 2031}
@@ -2042,7 +2037,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2042 if (error) 2037 if (error)
2043 return error; 2038 return error;
2044 2039
2045 if (!dir->i_op || !dir->i_op->mkdir) 2040 if (!dir->i_op->mkdir)
2046 return -EPERM; 2041 return -EPERM;
2047 2042
2048 mode &= (S_IRWXUGO|S_ISVTX); 2043 mode &= (S_IRWXUGO|S_ISVTX);
@@ -2057,7 +2052,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2057 return error; 2052 return error;
2058} 2053}
2059 2054
2060asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode) 2055SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2061{ 2056{
2062 int error = 0; 2057 int error = 0;
2063 char * tmp; 2058 char * tmp;
@@ -2078,7 +2073,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
2078 error = mnt_want_write(nd.path.mnt); 2073 error = mnt_want_write(nd.path.mnt);
2079 if (error) 2074 if (error)
2080 goto out_dput; 2075 goto out_dput;
2076 error = security_path_mkdir(&nd.path, dentry, mode);
2077 if (error)
2078 goto out_drop_write;
2081 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); 2079 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2080out_drop_write:
2082 mnt_drop_write(nd.path.mnt); 2081 mnt_drop_write(nd.path.mnt);
2083out_dput: 2082out_dput:
2084 dput(dentry); 2083 dput(dentry);
@@ -2090,7 +2089,7 @@ out_err:
2090 return error; 2089 return error;
2091} 2090}
2092 2091
2093asmlinkage long sys_mkdir(const char __user *pathname, int mode) 2092SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2094{ 2093{
2095 return sys_mkdirat(AT_FDCWD, pathname, mode); 2094 return sys_mkdirat(AT_FDCWD, pathname, mode);
2096} 2095}
@@ -2129,7 +2128,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2129 if (error) 2128 if (error)
2130 return error; 2129 return error;
2131 2130
2132 if (!dir->i_op || !dir->i_op->rmdir) 2131 if (!dir->i_op->rmdir)
2133 return -EPERM; 2132 return -EPERM;
2134 2133
2135 DQUOT_INIT(dir); 2134 DQUOT_INIT(dir);
@@ -2188,7 +2187,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
2188 error = mnt_want_write(nd.path.mnt); 2187 error = mnt_want_write(nd.path.mnt);
2189 if (error) 2188 if (error)
2190 goto exit3; 2189 goto exit3;
2190 error = security_path_rmdir(&nd.path, dentry);
2191 if (error)
2192 goto exit4;
2191 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 2193 error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2194exit4:
2192 mnt_drop_write(nd.path.mnt); 2195 mnt_drop_write(nd.path.mnt);
2193exit3: 2196exit3:
2194 dput(dentry); 2197 dput(dentry);
@@ -2200,7 +2203,7 @@ exit1:
2200 return error; 2203 return error;
2201} 2204}
2202 2205
2203asmlinkage long sys_rmdir(const char __user *pathname) 2206SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
2204{ 2207{
2205 return do_rmdir(AT_FDCWD, pathname); 2208 return do_rmdir(AT_FDCWD, pathname);
2206} 2209}
@@ -2212,7 +2215,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2212 if (error) 2215 if (error)
2213 return error; 2216 return error;
2214 2217
2215 if (!dir->i_op || !dir->i_op->unlink) 2218 if (!dir->i_op->unlink)
2216 return -EPERM; 2219 return -EPERM;
2217 2220
2218 DQUOT_INIT(dir); 2221 DQUOT_INIT(dir);
@@ -2273,7 +2276,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2273 error = mnt_want_write(nd.path.mnt); 2276 error = mnt_want_write(nd.path.mnt);
2274 if (error) 2277 if (error)
2275 goto exit2; 2278 goto exit2;
2279 error = security_path_unlink(&nd.path, dentry);
2280 if (error)
2281 goto exit3;
2276 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 2282 error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2283exit3:
2277 mnt_drop_write(nd.path.mnt); 2284 mnt_drop_write(nd.path.mnt);
2278 exit2: 2285 exit2:
2279 dput(dentry); 2286 dput(dentry);
@@ -2292,7 +2299,7 @@ slashes:
2292 goto exit2; 2299 goto exit2;
2293} 2300}
2294 2301
2295asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag) 2302SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
2296{ 2303{
2297 if ((flag & ~AT_REMOVEDIR) != 0) 2304 if ((flag & ~AT_REMOVEDIR) != 0)
2298 return -EINVAL; 2305 return -EINVAL;
@@ -2303,7 +2310,7 @@ asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
2303 return do_unlinkat(dfd, pathname); 2310 return do_unlinkat(dfd, pathname);
2304} 2311}
2305 2312
2306asmlinkage long sys_unlink(const char __user *pathname) 2313SYSCALL_DEFINE1(unlink, const char __user *, pathname)
2307{ 2314{
2308 return do_unlinkat(AT_FDCWD, pathname); 2315 return do_unlinkat(AT_FDCWD, pathname);
2309} 2316}
@@ -2315,7 +2322,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2315 if (error) 2322 if (error)
2316 return error; 2323 return error;
2317 2324
2318 if (!dir->i_op || !dir->i_op->symlink) 2325 if (!dir->i_op->symlink)
2319 return -EPERM; 2326 return -EPERM;
2320 2327
2321 error = security_inode_symlink(dir, dentry, oldname); 2328 error = security_inode_symlink(dir, dentry, oldname);
@@ -2329,8 +2336,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2329 return error; 2336 return error;
2330} 2337}
2331 2338
2332asmlinkage long sys_symlinkat(const char __user *oldname, 2339SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
2333 int newdfd, const char __user *newname) 2340 int, newdfd, const char __user *, newname)
2334{ 2341{
2335 int error; 2342 int error;
2336 char *from; 2343 char *from;
@@ -2354,7 +2361,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
2354 error = mnt_want_write(nd.path.mnt); 2361 error = mnt_want_write(nd.path.mnt);
2355 if (error) 2362 if (error)
2356 goto out_dput; 2363 goto out_dput;
2364 error = security_path_symlink(&nd.path, dentry, from);
2365 if (error)
2366 goto out_drop_write;
2357 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); 2367 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
2368out_drop_write:
2358 mnt_drop_write(nd.path.mnt); 2369 mnt_drop_write(nd.path.mnt);
2359out_dput: 2370out_dput:
2360 dput(dentry); 2371 dput(dentry);
@@ -2367,7 +2378,7 @@ out_putname:
2367 return error; 2378 return error;
2368} 2379}
2369 2380
2370asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname) 2381SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
2371{ 2382{
2372 return sys_symlinkat(oldname, AT_FDCWD, newname); 2383 return sys_symlinkat(oldname, AT_FDCWD, newname);
2373} 2384}
@@ -2392,7 +2403,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2392 */ 2403 */
2393 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 2404 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2394 return -EPERM; 2405 return -EPERM;
2395 if (!dir->i_op || !dir->i_op->link) 2406 if (!dir->i_op->link)
2396 return -EPERM; 2407 return -EPERM;
2397 if (S_ISDIR(inode->i_mode)) 2408 if (S_ISDIR(inode->i_mode))
2398 return -EPERM; 2409 return -EPERM;
@@ -2419,9 +2430,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2419 * with linux 2.0, and to avoid hard-linking to directories 2430 * with linux 2.0, and to avoid hard-linking to directories
2420 * and other special files. --ADM 2431 * and other special files. --ADM
2421 */ 2432 */
2422asmlinkage long sys_linkat(int olddfd, const char __user *oldname, 2433SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
2423 int newdfd, const char __user *newname, 2434 int, newdfd, const char __user *, newname, int, flags)
2424 int flags)
2425{ 2435{
2426 struct dentry *new_dentry; 2436 struct dentry *new_dentry;
2427 struct nameidata nd; 2437 struct nameidata nd;
@@ -2451,7 +2461,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2451 error = mnt_want_write(nd.path.mnt); 2461 error = mnt_want_write(nd.path.mnt);
2452 if (error) 2462 if (error)
2453 goto out_dput; 2463 goto out_dput;
2464 error = security_path_link(old_path.dentry, &nd.path, new_dentry);
2465 if (error)
2466 goto out_drop_write;
2454 error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); 2467 error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
2468out_drop_write:
2455 mnt_drop_write(nd.path.mnt); 2469 mnt_drop_write(nd.path.mnt);
2456out_dput: 2470out_dput:
2457 dput(new_dentry); 2471 dput(new_dentry);
@@ -2466,7 +2480,7 @@ out:
2466 return error; 2480 return error;
2467} 2481}
2468 2482
2469asmlinkage long sys_link(const char __user *oldname, const char __user *newname) 2483SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
2470{ 2484{
2471 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); 2485 return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
2472} 2486}
@@ -2595,7 +2609,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2595 if (error) 2609 if (error)
2596 return error; 2610 return error;
2597 2611
2598 if (!old_dir->i_op || !old_dir->i_op->rename) 2612 if (!old_dir->i_op->rename)
2599 return -EPERM; 2613 return -EPERM;
2600 2614
2601 DQUOT_INIT(old_dir); 2615 DQUOT_INIT(old_dir);
@@ -2617,8 +2631,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2617 return error; 2631 return error;
2618} 2632}
2619 2633
2620asmlinkage long sys_renameat(int olddfd, const char __user *oldname, 2634SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
2621 int newdfd, const char __user *newname) 2635 int, newdfd, const char __user *, newname)
2622{ 2636{
2623 struct dentry *old_dir, *new_dir; 2637 struct dentry *old_dir, *new_dir;
2624 struct dentry *old_dentry, *new_dentry; 2638 struct dentry *old_dentry, *new_dentry;
@@ -2687,8 +2701,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
2687 error = mnt_want_write(oldnd.path.mnt); 2701 error = mnt_want_write(oldnd.path.mnt);
2688 if (error) 2702 if (error)
2689 goto exit5; 2703 goto exit5;
2704 error = security_path_rename(&oldnd.path, old_dentry,
2705 &newnd.path, new_dentry);
2706 if (error)
2707 goto exit6;
2690 error = vfs_rename(old_dir->d_inode, old_dentry, 2708 error = vfs_rename(old_dir->d_inode, old_dentry,
2691 new_dir->d_inode, new_dentry); 2709 new_dir->d_inode, new_dentry);
2710exit6:
2692 mnt_drop_write(oldnd.path.mnt); 2711 mnt_drop_write(oldnd.path.mnt);
2693exit5: 2712exit5:
2694 dput(new_dentry); 2713 dput(new_dentry);
@@ -2706,7 +2725,7 @@ exit:
2706 return error; 2725 return error;
2707} 2726}
2708 2727
2709asmlinkage long sys_rename(const char __user *oldname, const char __user *newname) 2728SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
2710{ 2729{
2711 return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); 2730 return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
2712} 2731}
@@ -2758,13 +2777,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link)
2758/* get the link contents into pagecache */ 2777/* get the link contents into pagecache */
2759static char *page_getlink(struct dentry * dentry, struct page **ppage) 2778static char *page_getlink(struct dentry * dentry, struct page **ppage)
2760{ 2779{
2761 struct page * page; 2780 char *kaddr;
2781 struct page *page;
2762 struct address_space *mapping = dentry->d_inode->i_mapping; 2782 struct address_space *mapping = dentry->d_inode->i_mapping;
2763 page = read_mapping_page(mapping, 0, NULL); 2783 page = read_mapping_page(mapping, 0, NULL);
2764 if (IS_ERR(page)) 2784 if (IS_ERR(page))
2765 return (char*)page; 2785 return (char*)page;
2766 *ppage = page; 2786 *ppage = page;
2767 return kmap(page); 2787 kaddr = kmap(page);
2788 nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
2789 return kaddr;
2768} 2790}
2769 2791
2770int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 2792int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
@@ -2796,18 +2818,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2796 } 2818 }
2797} 2819}
2798 2820
2799int __page_symlink(struct inode *inode, const char *symname, int len, 2821/*
2800 gfp_t gfp_mask) 2822 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
2823 */
2824int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
2801{ 2825{
2802 struct address_space *mapping = inode->i_mapping; 2826 struct address_space *mapping = inode->i_mapping;
2803 struct page *page; 2827 struct page *page;
2804 void *fsdata; 2828 void *fsdata;
2805 int err; 2829 int err;
2806 char *kaddr; 2830 char *kaddr;
2831 unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
2832 if (nofs)
2833 flags |= AOP_FLAG_NOFS;
2807 2834
2808retry: 2835retry:
2809 err = pagecache_write_begin(NULL, mapping, 0, len-1, 2836 err = pagecache_write_begin(NULL, mapping, 0, len-1,
2810 AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); 2837 flags, &page, &fsdata);
2811 if (err) 2838 if (err)
2812 goto fail; 2839 goto fail;
2813 2840
@@ -2831,7 +2858,7 @@ fail:
2831int page_symlink(struct inode *inode, const char *symname, int len) 2858int page_symlink(struct inode *inode, const char *symname, int len)
2832{ 2859{
2833 return __page_symlink(inode, symname, len, 2860 return __page_symlink(inode, symname, len,
2834 mapping_gfp_mask(inode->i_mapping)); 2861 !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
2835} 2862}
2836 2863
2837const struct inode_operations page_symlink_inode_operations = { 2864const struct inode_operations page_symlink_inode_operations = {
@@ -2857,7 +2884,6 @@ EXPORT_SYMBOL(path_lookup);
2857EXPORT_SYMBOL(kern_path); 2884EXPORT_SYMBOL(kern_path);
2858EXPORT_SYMBOL(vfs_path_lookup); 2885EXPORT_SYMBOL(vfs_path_lookup);
2859EXPORT_SYMBOL(inode_permission); 2886EXPORT_SYMBOL(inode_permission);
2860EXPORT_SYMBOL(vfs_permission);
2861EXPORT_SYMBOL(file_permission); 2887EXPORT_SYMBOL(file_permission);
2862EXPORT_SYMBOL(unlock_rename); 2888EXPORT_SYMBOL(unlock_rename);
2863EXPORT_SYMBOL(vfs_create); 2889EXPORT_SYMBOL(vfs_create);
@@ -2873,3 +2899,10 @@ EXPORT_SYMBOL(vfs_symlink);
2873EXPORT_SYMBOL(vfs_unlink); 2899EXPORT_SYMBOL(vfs_unlink);
2874EXPORT_SYMBOL(dentry_unhash); 2900EXPORT_SYMBOL(dentry_unhash);
2875EXPORT_SYMBOL(generic_readlink); 2901EXPORT_SYMBOL(generic_readlink);
2902
2903/* to be mentioned only in INIT_TASK */
2904struct fs_struct init_fs = {
2905 .count = ATOMIC_INIT(1),
2906 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2907 .umask = 0022,
2908};
diff --git a/fs/namespace.c b/fs/namespace.c
index 1c09cab8f7cf..228d8c4bfd18 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1128,7 +1128,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1128 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD 1128 * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1129 */ 1129 */
1130 1130
1131asmlinkage long sys_umount(char __user * name, int flags) 1131SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1132{ 1132{
1133 struct path path; 1133 struct path path;
1134 int retval; 1134 int retval;
@@ -1160,7 +1160,7 @@ out:
1160/* 1160/*
1161 * The 2.0 compatible umount. No flags. 1161 * The 2.0 compatible umount. No flags.
1162 */ 1162 */
1163asmlinkage long sys_oldumount(char __user * name) 1163SYSCALL_DEFINE1(oldumount, char __user *, name)
1164{ 1164{
1165 return sys_umount(name, 0); 1165 return sys_umount(name, 0);
1166} 1166}
@@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
1990 if (!new_ns->root) { 1990 if (!new_ns->root) {
1991 up_write(&namespace_sem); 1991 up_write(&namespace_sem);
1992 kfree(new_ns); 1992 kfree(new_ns);
1993 return ERR_PTR(-ENOMEM);; 1993 return ERR_PTR(-ENOMEM);
1994 } 1994 }
1995 spin_lock(&vfsmount_lock); 1995 spin_lock(&vfsmount_lock);
1996 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 1996 list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
@@ -2045,9 +2045,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2045 return new_ns; 2045 return new_ns;
2046} 2046}
2047 2047
2048asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, 2048SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2049 char __user * type, unsigned long flags, 2049 char __user *, type, unsigned long, flags, void __user *, data)
2050 void __user * data)
2051{ 2050{
2052 int retval; 2051 int retval;
2053 unsigned long data_page; 2052 unsigned long data_page;
@@ -2172,8 +2171,8 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root)
2172 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root 2171 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
2173 * first. 2172 * first.
2174 */ 2173 */
2175asmlinkage long sys_pivot_root(const char __user * new_root, 2174SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2176 const char __user * put_old) 2175 const char __user *, put_old)
2177{ 2176{
2178 struct vfsmount *tmp; 2177 struct vfsmount *tmp;
2179 struct path new, old, parent_path, root_parent, root; 2178 struct path new, old, parent_path, root_parent, root;
diff --git a/fs/ncpfs/Kconfig b/fs/ncpfs/Kconfig
index 142808427b25..c931cf22a1f6 100644
--- a/fs/ncpfs/Kconfig
+++ b/fs/ncpfs/Kconfig
@@ -1,6 +1,27 @@
1# 1#
2# NCP Filesystem configuration 2# NCP Filesystem configuration
3# 3#
4config NCP_FS
5 tristate "NCP file system support (to mount NetWare volumes)"
6 depends on IPX!=n || INET
7 help
8 NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
9 used by Novell NetWare clients to talk to file servers. It is to
10 IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you
11 to mount NetWare file server volumes and to access them just like
12 any other Unix directory. For details, please read the file
13 <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
14 the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
15
16 You do not have to say Y here if you want your Linux box to act as a
17 file *server* for Novell NetWare clients.
18
19 General information about how to connect Linux, Windows machines and
20 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
21
22 To compile this as a module, choose M here: the module will be called
23 ncpfs. Say N unless you are connected to a Novell network.
24
4config NCPFS_PACKET_SIGNING 25config NCPFS_PACKET_SIGNING
5 bool "Packet signatures" 26 bool "Packet signatures"
6 depends on NCP_FS 27 depends on NCP_FS
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 335b003dddf9..0af3349de851 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -16,7 +16,6 @@
16 * @opts: an array of &struct option entries controlling parser operations 16 * @opts: an array of &struct option entries controlling parser operations
17 * @optopt: output; will contain the current option 17 * @optopt: output; will contain the current option
18 * @optarg: output; will contain the value (if one exists) 18 * @optarg: output; will contain the value (if one exists)
19 * @flag: output; may be NULL; should point to a long for or'ing flags
20 * @value: output; may be NULL; will be overwritten with the integer value 19 * @value: output; may be NULL; will be overwritten with the integer value
21 * of the current argument. 20 * of the current argument.
22 * 21 *
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6d04e050c74e..f54360f50a9c 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl
98{ 98{
99 s32 auth_type; 99 s32 auth_type;
100 u32 object_name_len; 100 u32 object_name_len;
101 compat_caddr_t object_name; /* an userspace data, in most cases user name */ 101 compat_caddr_t object_name; /* a userspace data, in most cases user name */
102}; 102};
103 103
104struct compat_ncp_fs_info_v2 { 104struct compat_ncp_fs_info_v2 {
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
new file mode 100644
index 000000000000..36fe20d6eba2
--- /dev/null
+++ b/fs/nfs/Kconfig
@@ -0,0 +1,86 @@
1config NFS_FS
2 tristate "NFS client support"
3 depends on INET
4 select LOCKD
5 select SUNRPC
6 select NFS_ACL_SUPPORT if NFS_V3_ACL
7 help
8 Choose Y here if you want to access files residing on other
9 computers using Sun's Network File System protocol. To compile
10 this file system support as a module, choose M here: the module
11 will be called nfs.
12
13 To mount file systems exported by NFS servers, you also need to
14 install the user space mount.nfs command which can be found in
15 the Linux nfs-utils package, available from http://linux-nfs.org/.
16 Information about using the mount command is available in the
17 mount(8) man page. More detail about the Linux NFS client
18 implementation is available via the nfs(5) man page.
19
20 Below you can choose which versions of the NFS protocol are
21 available in the kernel to mount NFS servers. Support for NFS
22 version 2 (RFC 1094) is always available when NFS_FS is selected.
23
24 To configure a system which mounts its root file system via NFS
25 at boot time, say Y here, select "Kernel level IP
26 autoconfiguration" in the NETWORK menu, and select "Root file
27 system on NFS" below. You cannot compile this file system as a
28 module in this case.
29
30 If unsure, say N.
31
32config NFS_V3
33 bool "NFS client support for NFS version 3"
34 depends on NFS_FS
35 help
36 This option enables support for version 3 of the NFS protocol
37 (RFC 1813) in the kernel's NFS client.
38
39 If unsure, say Y.
40
41config NFS_V3_ACL
42 bool "NFS client support for the NFSv3 ACL protocol extension"
43 depends on NFS_V3
44 help
45 Some NFS servers support an auxiliary NFSv3 ACL protocol that
46 Sun added to Solaris but which never became an official part of the
47 NFS version 3 protocol. This protocol extension allows
48 applications on NFS clients to manipulate POSIX Access Control
49 Lists on files residing on NFS servers. NFS servers enforce
50 ACLs on local files whether this protocol is available or not.
51
52 Choose Y here if your NFS server supports the Solaris NFSv3 ACL
53 protocol extension and you want your NFS client to allow
54 applications to access and modify ACLs on files on the server.
55
56 Most NFS servers don't support the Solaris NFSv3 ACL protocol
57 extension. You can choose N here or specify the "noacl" mount
58 option to prevent your NFS client from trying to use the NFSv3
59 ACL protocol.
60
61 If unsure, say N.
62
63config NFS_V4
64 bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
65 depends on NFS_FS && EXPERIMENTAL
66 select RPCSEC_GSS_KRB5
67 help
68 This option enables support for version 4 of the NFS protocol
69 (RFC 3530) in the kernel's NFS client.
70
71 To mount NFS servers using NFSv4, you also need to install user
72 space programs which can be found in the Linux nfs-utils package,
73 available from http://linux-nfs.org/.
74
75 If unsure, say N.
76
77config ROOT_NFS
78 bool "Root file system on NFS"
79 depends on NFS_FS=y && IP_PNP
80 help
81 If you want your system to mount its root file system via NFS,
82 choose Y here. This is common practice for managing systems
83 without local permanent storage. For details, read
84 <file:Documentation/filesystems/nfsroot.txt>.
85
86 Most people say N here.
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d319b49f8f06..90f292b520d2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
354 file->f_path.dentry->d_name.name, 354 file->f_path.dentry->d_name.name,
355 mapping->host->i_ino, len, (long long) pos); 355 mapping->host->i_ino, len, (long long) pos);
356 356
357 page = __grab_cache_page(mapping, index); 357 page = grab_cache_page_write_begin(mapping, index, flags);
358 if (!page) 358 if (!page)
359 return -ENOMEM; 359 return -ENOMEM;
360 *pagep = page; 360 *pagep = page;
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b1acbd6ab6fb..8f9a20556f79 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags)
38 return ERR_PTR(error); 38 return ERR_PTR(error);
39 39
40 if (flags == O_RDWR) 40 if (flags == O_RDWR)
41 error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE); 41 error = may_open(&nd.path, MAY_READ|MAY_WRITE,
42 FMODE_READ|FMODE_WRITE);
42 else 43 else
43 error = may_open(&nd, MAY_WRITE, FMODE_WRITE); 44 error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
44 45
45 if (!error) 46 if (!error)
46 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 47 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
@@ -85,8 +86,8 @@ static struct {
85 }, 86 },
86}; 87};
87 88
88long 89SYSCALL_DEFINE3(nfsservctl, int, cmd, struct nfsctl_arg __user *, arg,
89asmlinkage sys_nfsservctl(int cmd, struct nfsctl_arg __user *arg, void __user *res) 90 void __user *, res)
90{ 91{
91 struct file *file; 92 struct file *file;
92 void __user *p = &arg->u; 93 void __user *p = &arg->u;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
new file mode 100644
index 000000000000..44d7d04dab95
--- /dev/null
+++ b/fs/nfsd/Kconfig
@@ -0,0 +1,80 @@
1config NFSD
2 tristate "NFS server support"
3 depends on INET
4 select LOCKD
5 select SUNRPC
6 select EXPORTFS
7 select NFS_ACL_SUPPORT if NFSD_V2_ACL
8 help
9 Choose Y here if you want to allow other computers to access
10 files residing on this system using Sun's Network File System
11 protocol. To compile the NFS server support as a module,
12 choose M here: the module will be called nfsd.
13
14 You may choose to use a user-space NFS server instead, in which
15 case you can choose N here.
16
17 To export local file systems using NFS, you also need to install
18 user space programs which can be found in the Linux nfs-utils
19 package, available from http://linux-nfs.org/. More detail about
20 the Linux NFS server implementation is available via the
21 exports(5) man page.
22
23 Below you can choose which versions of the NFS protocol are
24 available to clients mounting the NFS server on this system.
25 Support for NFS version 2 (RFC 1094) is always available when
26 CONFIG_NFSD is selected.
27
28 If unsure, say N.
29
30config NFSD_V2_ACL
31 bool
32 depends on NFSD
33
34config NFSD_V3
35 bool "NFS server support for NFS version 3"
36 depends on NFSD
37 help
38 This option enables support in your system's NFS server for
39 version 3 of the NFS protocol (RFC 1813).
40
41 If unsure, say Y.
42
43config NFSD_V3_ACL
44 bool "NFS server support for the NFSv3 ACL protocol extension"
45 depends on NFSD_V3
46 select NFSD_V2_ACL
47 help
48 Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
49 never became an official part of the NFS version 3 protocol.
50 This protocol extension allows applications on NFS clients to
51 manipulate POSIX Access Control Lists on files residing on NFS
52 servers. NFS servers enforce POSIX ACLs on local files whether
53 this protocol is available or not.
54
55 This option enables support in your system's NFS server for the
56 NFSv3 ACL protocol extension allowing NFS clients to manipulate
57 POSIX ACLs on files exported by your system's NFS server. NFS
58 clients which support the Solaris NFSv3 ACL protocol can then
59 access and modify ACLs on your NFS server.
60
61 To store ACLs on your NFS server, you also need to enable ACL-
62 related CONFIG options for your local file systems of choice.
63
64 If unsure, say N.
65
66config NFSD_V4
67 bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
68 depends on NFSD && PROC_FS && EXPERIMENTAL
69 select NFSD_V3
70 select FS_POSIX_ACL
71 select RPCSEC_GSS_KRB5
72 help
73 This option enables support in your system's NFS server for
74 version 4 of the NFS protocol (RFC 3530).
75
76 To export files using NFSv4, you need to install additional user
77 space programs which can be found in the Linux nfs-utils package,
78 available from http://linux-nfs.org/.
79
80 If unsure, say N.
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 0184fe9b514c..5573508f707f 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -49,6 +49,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
49 new->fsuid = exp->ex_anon_uid; 49 new->fsuid = exp->ex_anon_uid;
50 new->fsgid = exp->ex_anon_gid; 50 new->fsgid = exp->ex_anon_gid;
51 gi = groups_alloc(0); 51 gi = groups_alloc(0);
52 if (!gi)
53 goto oom;
52 } else if (flags & NFSEXP_ROOTSQUASH) { 54 } else if (flags & NFSEXP_ROOTSQUASH) {
53 if (!new->fsuid) 55 if (!new->fsuid)
54 new->fsuid = exp->ex_anon_uid; 56 new->fsuid = exp->ex_anon_uid;
@@ -76,15 +78,16 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
76 78
77 ret = set_groups(new, gi); 79 ret = set_groups(new, gi);
78 put_group_info(gi); 80 put_group_info(gi);
79 if (!ret) 81 if (ret < 0)
80 goto error; 82 goto error;
81 83
82 if (new->uid) 84 if (new->fsuid)
83 new->cap_effective = cap_drop_nfsd_set(new->cap_effective); 85 new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
84 else 86 else
85 new->cap_effective = cap_raise_nfsd_set(new->cap_effective, 87 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
86 new->cap_permitted); 88 new->cap_permitted);
87 put_cred(override_creds(new)); 89 put_cred(override_creds(new));
90 put_cred(new);
88 return 0; 91 return 0;
89 92
90oom: 93oom:
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6d7d8c02c197..c464181b5994 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,9 +53,6 @@
53#define NFSPROC4_CB_NULL 0 53#define NFSPROC4_CB_NULL 0
54#define NFSPROC4_CB_COMPOUND 1 54#define NFSPROC4_CB_COMPOUND 1
55 55
56/* declarations */
57static const struct rpc_call_ops nfs4_cb_null_ops;
58
59/* Index of predefined Linux callback client operations */ 56/* Index of predefined Linux callback client operations */
60 57
61enum { 58enum {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 669461e291ae..9fa60a3ad48c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -946,6 +946,11 @@ encode_op:
946 nfsd4_encode_operation(resp, op); 946 nfsd4_encode_operation(resp, op);
947 status = op->status; 947 status = op->status;
948 } 948 }
949
950 dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
951 args->ops, args->opcnt, resp->opcnt, op->opnum,
952 be32_to_cpu(status));
953
949 if (cstate->replay_owner) { 954 if (cstate->replay_owner) {
950 nfs4_put_stateowner(cstate->replay_owner); 955 nfs4_put_stateowner(cstate->replay_owner);
951 cstate->replay_owner = NULL; 956 cstate->replay_owner = NULL;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 0f9d6efaa62b..74f7b67567fd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
116 116
117 md5_to_hex(dname, cksum.data); 117 md5_to_hex(dname, cksum.data);
118 118
119 kfree(cksum.data);
120 status = nfs_ok; 119 status = nfs_ok;
121out: 120out:
121 kfree(cksum.data);
122 crypto_free_hash(desc.tfm); 122 crypto_free_hash(desc.tfm);
123out_no_tfm: 123out_no_tfm:
124 return status; 124 return status;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 13e0e074dbb8..b6f60f48e94b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2416,6 +2416,26 @@ out:
2416#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) 2416#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
2417#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) 2417#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
2418 2418
2419static inline u64
2420end_offset(u64 start, u64 len)
2421{
2422 u64 end;
2423
2424 end = start + len;
2425 return end >= start ? end: NFS4_MAX_UINT64;
2426}
2427
2428/* last octet in a range */
2429static inline u64
2430last_byte_offset(u64 start, u64 len)
2431{
2432 u64 end;
2433
2434 BUG_ON(!len);
2435 end = start + len;
2436 return end > start ? end - 1: NFS4_MAX_UINT64;
2437}
2438
2419#define lockownerid_hashval(id) \ 2439#define lockownerid_hashval(id) \
2420 ((id) & LOCK_HASH_MASK) 2440 ((id) & LOCK_HASH_MASK)
2421 2441
@@ -2435,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
2435static struct nfs4_stateid * 2455static struct nfs4_stateid *
2436find_stateid(stateid_t *stid, int flags) 2456find_stateid(stateid_t *stid, int flags)
2437{ 2457{
2438 struct nfs4_stateid *local = NULL; 2458 struct nfs4_stateid *local;
2439 u32 st_id = stid->si_stateownerid; 2459 u32 st_id = stid->si_stateownerid;
2440 u32 f_id = stid->si_fileid; 2460 u32 f_id = stid->si_fileid;
2441 unsigned int hashval; 2461 unsigned int hashval;
2442 2462
2443 dprintk("NFSD: find_stateid flags 0x%x\n",flags); 2463 dprintk("NFSD: find_stateid flags 0x%x\n",flags);
2444 if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { 2464 if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
2445 hashval = stateid_hashval(st_id, f_id); 2465 hashval = stateid_hashval(st_id, f_id);
2446 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { 2466 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
2447 if ((local->st_stateid.si_stateownerid == st_id) && 2467 if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2449,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags)
2449 return local; 2469 return local;
2450 } 2470 }
2451 } 2471 }
2452 if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { 2472
2473 if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
2453 hashval = stateid_hashval(st_id, f_id); 2474 hashval = stateid_hashval(st_id, f_id);
2454 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { 2475 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
2455 if ((local->st_stateid.si_stateownerid == st_id) && 2476 if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2518,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
2518 deny->ld_clientid.cl_id = 0; 2539 deny->ld_clientid.cl_id = 0;
2519 } 2540 }
2520 deny->ld_start = fl->fl_start; 2541 deny->ld_start = fl->fl_start;
2521 deny->ld_length = ~(u64)0; 2542 deny->ld_length = NFS4_MAX_UINT64;
2522 if (fl->fl_end != ~(u64)0) 2543 if (fl->fl_end != NFS4_MAX_UINT64)
2523 deny->ld_length = fl->fl_end - fl->fl_start + 1; 2544 deny->ld_length = fl->fl_end - fl->fl_start + 1;
2524 deny->ld_type = NFS4_READ_LT; 2545 deny->ld_type = NFS4_READ_LT;
2525 if (fl->fl_type != F_RDLCK) 2546 if (fl->fl_type != F_RDLCK)
@@ -2616,7 +2637,7 @@ out:
2616static int 2637static int
2617check_lock_length(u64 offset, u64 length) 2638check_lock_length(u64 offset, u64 length)
2618{ 2639{
2619 return ((length == 0) || ((length != ~(u64)0) && 2640 return ((length == 0) || ((length != NFS4_MAX_UINT64) &&
2620 LOFF_OVERFLOW(offset, length))); 2641 LOFF_OVERFLOW(offset, length)));
2621} 2642}
2622 2643
@@ -2736,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2736 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2757 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2737 2758
2738 file_lock.fl_start = lock->lk_offset; 2759 file_lock.fl_start = lock->lk_offset;
2739 if ((lock->lk_length == ~(u64)0) || 2760 file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
2740 LOFF_OVERFLOW(lock->lk_offset, lock->lk_length))
2741 file_lock.fl_end = ~(u64)0;
2742 else
2743 file_lock.fl_end = lock->lk_offset + lock->lk_length - 1;
2744 nfs4_transform_lock_offset(&file_lock); 2761 nfs4_transform_lock_offset(&file_lock);
2745 2762
2746 /* 2763 /*
@@ -2781,6 +2798,25 @@ out:
2781} 2798}
2782 2799
2783/* 2800/*
2801 * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
2802 * so we do a temporary open here just to get an open file to pass to
2803 * vfs_test_lock. (Arguably perhaps test_lock should be done with an
2804 * inode operation.)
2805 */
2806static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
2807{
2808 struct file *file;
2809 int err;
2810
2811 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
2812 if (err)
2813 return err;
2814 err = vfs_test_lock(file, lock);
2815 nfsd_close(file);
2816 return err;
2817}
2818
2819/*
2784 * LOCKT operation 2820 * LOCKT operation
2785 */ 2821 */
2786__be32 2822__be32
@@ -2788,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2788 struct nfsd4_lockt *lockt) 2824 struct nfsd4_lockt *lockt)
2789{ 2825{
2790 struct inode *inode; 2826 struct inode *inode;
2791 struct file file;
2792 struct file_lock file_lock; 2827 struct file_lock file_lock;
2793 int error; 2828 int error;
2794 __be32 status; 2829 __be32 status;
@@ -2836,26 +2871,14 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2836 file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; 2871 file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
2837 file_lock.fl_pid = current->tgid; 2872 file_lock.fl_pid = current->tgid;
2838 file_lock.fl_flags = FL_POSIX; 2873 file_lock.fl_flags = FL_POSIX;
2839 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2840 2874
2841 file_lock.fl_start = lockt->lt_offset; 2875 file_lock.fl_start = lockt->lt_offset;
2842 if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) 2876 file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
2843 file_lock.fl_end = ~(u64)0;
2844 else
2845 file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1;
2846 2877
2847 nfs4_transform_lock_offset(&file_lock); 2878 nfs4_transform_lock_offset(&file_lock);
2848 2879
2849 /* vfs_test_lock uses the struct file _only_ to resolve the inode.
2850 * since LOCKT doesn't require an OPEN, and therefore a struct
2851 * file may not exist, pass vfs_test_lock a struct file with
2852 * only the dentry:inode set.
2853 */
2854 memset(&file, 0, sizeof (struct file));
2855 file.f_path.dentry = cstate->current_fh.fh_dentry;
2856
2857 status = nfs_ok; 2880 status = nfs_ok;
2858 error = vfs_test_lock(&file, &file_lock); 2881 error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
2859 if (error) { 2882 if (error) {
2860 status = nfserrno(error); 2883 status = nfserrno(error);
2861 goto out; 2884 goto out;
@@ -2906,10 +2929,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2906 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2929 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2907 file_lock.fl_start = locku->lu_offset; 2930 file_lock.fl_start = locku->lu_offset;
2908 2931
2909 if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) 2932 file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length);
2910 file_lock.fl_end = ~(u64)0;
2911 else
2912 file_lock.fl_end = locku->lu_offset + locku->lu_length - 1;
2913 nfs4_transform_lock_offset(&file_lock); 2933 nfs4_transform_lock_offset(&file_lock);
2914 2934
2915 /* 2935 /*
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index afcdf4b76843..f65953be39c0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfs/nfs4xdr.c
3 *
4 * Server-side XDR for NFSv4 2 * Server-side XDR for NFSv4
5 * 3 *
6 * Copyright (c) 2002 The Regents of the University of Michigan. 4 * Copyright (c) 2002 The Regents of the University of Michigan.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77d7b8c531a6..3d93b2064ce5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size);
84static ssize_t write_getfd(struct file *file, char *buf, size_t size); 84static ssize_t write_getfd(struct file *file, char *buf, size_t size);
85static ssize_t write_getfs(struct file *file, char *buf, size_t size); 85static ssize_t write_getfs(struct file *file, char *buf, size_t size);
86static ssize_t write_filehandle(struct file *file, char *buf, size_t size); 86static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
87static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
88static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
87static ssize_t write_threads(struct file *file, char *buf, size_t size); 89static ssize_t write_threads(struct file *file, char *buf, size_t size);
88static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); 90static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
89static ssize_t write_versions(struct file *file, char *buf, size_t size); 91static ssize_t write_versions(struct file *file, char *buf, size_t size);
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
94static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 96static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
95#endif 97#endif
96 98
97static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size);
98static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size);
99
100static ssize_t (*write_op[])(struct file *, char *, size_t) = { 99static ssize_t (*write_op[])(struct file *, char *, size_t) = {
101 [NFSD_Svc] = write_svc, 100 [NFSD_Svc] = write_svc,
102 [NFSD_Add] = write_add, 101 [NFSD_Add] = write_add,
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
106 [NFSD_Getfd] = write_getfd, 105 [NFSD_Getfd] = write_getfd,
107 [NFSD_Getfs] = write_getfs, 106 [NFSD_Getfs] = write_getfs,
108 [NFSD_Fh] = write_filehandle, 107 [NFSD_Fh] = write_filehandle,
109 [NFSD_FO_UnlockIP] = failover_unlock_ip, 108 [NFSD_FO_UnlockIP] = write_unlock_ip,
110 [NFSD_FO_UnlockFS] = failover_unlock_fs, 109 [NFSD_FO_UnlockFS] = write_unlock_fs,
111 [NFSD_Threads] = write_threads, 110 [NFSD_Threads] = write_threads,
112 [NFSD_Pool_Threads] = write_pool_threads, 111 [NFSD_Pool_Threads] = write_pool_threads,
113 [NFSD_Versions] = write_versions, 112 [NFSD_Versions] = write_versions,
@@ -176,10 +175,24 @@ static const struct file_operations exports_operations = {
176/*----------------------------------------------------------------------------*/ 175/*----------------------------------------------------------------------------*/
177/* 176/*
178 * payload - write methods 177 * payload - write methods
179 * If the method has a response, the response should be put in buf,
180 * and the length returned. Otherwise return 0 or and -error.
181 */ 178 */
182 179
180/**
181 * write_svc - Start kernel's NFSD server
182 *
183 * Deprecated. /proc/fs/nfsd/threads is preferred.
184 * Function remains to support old versions of nfs-utils.
185 *
186 * Input:
187 * buf: struct nfsctl_svc
188 * svc_port: port number of this
189 * server's listener
190 * svc_nthreads: number of threads to start
191 * size: size in bytes of passed in nfsctl_svc
192 * Output:
193 * On success: returns zero
194 * On error: return code is negative errno value
195 */
183static ssize_t write_svc(struct file *file, char *buf, size_t size) 196static ssize_t write_svc(struct file *file, char *buf, size_t size)
184{ 197{
185 struct nfsctl_svc *data; 198 struct nfsctl_svc *data;
@@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size)
189 return nfsd_svc(data->svc_port, data->svc_nthreads); 202 return nfsd_svc(data->svc_port, data->svc_nthreads);
190} 203}
191 204
205/**
206 * write_add - Add or modify client entry in auth unix cache
207 *
208 * Deprecated. /proc/net/rpc/auth.unix.ip is preferred.
209 * Function remains to support old versions of nfs-utils.
210 *
211 * Input:
212 * buf: struct nfsctl_client
213 * cl_ident: '\0'-terminated C string
214 * containing domain name
215 * of client
216 * cl_naddr: no. of items in cl_addrlist
217 * cl_addrlist: array of client addresses
218 * cl_fhkeytype: ignored
219 * cl_fhkeylen: ignored
220 * cl_fhkey: ignored
221 * size: size in bytes of passed in nfsctl_client
222 * Output:
223 * On success: returns zero
224 * On error: return code is negative errno value
225 *
226 * Note: Only AF_INET client addresses are passed in, since
227 * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
228 */
192static ssize_t write_add(struct file *file, char *buf, size_t size) 229static ssize_t write_add(struct file *file, char *buf, size_t size)
193{ 230{
194 struct nfsctl_client *data; 231 struct nfsctl_client *data;
@@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size)
198 return exp_addclient(data); 235 return exp_addclient(data);
199} 236}
200 237
238/**
239 * write_del - Remove client from auth unix cache
240 *
241 * Deprecated. /proc/net/rpc/auth.unix.ip is preferred.
242 * Function remains to support old versions of nfs-utils.
243 *
244 * Input:
245 * buf: struct nfsctl_client
246 * cl_ident: '\0'-terminated C string
247 * containing domain name
248 * of client
249 * cl_naddr: ignored
250 * cl_addrlist: ignored
251 * cl_fhkeytype: ignored
252 * cl_fhkeylen: ignored
253 * cl_fhkey: ignored
254 * size: size in bytes of passed in nfsctl_client
255 * Output:
256 * On success: returns zero
257 * On error: return code is negative errno value
258 *
259 * Note: Only AF_INET client addresses are passed in, since
260 * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
261 */
201static ssize_t write_del(struct file *file, char *buf, size_t size) 262static ssize_t write_del(struct file *file, char *buf, size_t size)
202{ 263{
203 struct nfsctl_client *data; 264 struct nfsctl_client *data;
@@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size)
207 return exp_delclient(data); 268 return exp_delclient(data);
208} 269}
209 270
271/**
272 * write_export - Export part or all of a local file system
273 *
274 * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
275 * Function remains to support old versions of nfs-utils.
276 *
277 * Input:
278 * buf: struct nfsctl_export
279 * ex_client: '\0'-terminated C string
280 * containing domain name
281 * of client allowed to access
282 * this export
283 * ex_path: '\0'-terminated C string
284 * containing pathname of
285 * directory in local file system
286 * ex_dev: fsid to use for this export
287 * ex_ino: ignored
288 * ex_flags: export flags for this export
289 * ex_anon_uid: UID to use for anonymous
290 * requests
291 * ex_anon_gid: GID to use for anonymous
292 * requests
293 * size: size in bytes of passed in nfsctl_export
294 * Output:
295 * On success: returns zero
296 * On error: return code is negative errno value
297 */
210static ssize_t write_export(struct file *file, char *buf, size_t size) 298static ssize_t write_export(struct file *file, char *buf, size_t size)
211{ 299{
212 struct nfsctl_export *data; 300 struct nfsctl_export *data;
@@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size)
216 return exp_export(data); 304 return exp_export(data);
217} 305}
218 306
307/**
308 * write_unexport - Unexport a previously exported file system
309 *
310 * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
311 * Function remains to support old versions of nfs-utils.
312 *
313 * Input:
314 * buf: struct nfsctl_export
315 * ex_client: '\0'-terminated C string
316 * containing domain name
317 * of client no longer allowed
318 * to access this export
319 * ex_path: '\0'-terminated C string
320 * containing pathname of
321 * directory in local file system
322 * ex_dev: ignored
323 * ex_ino: ignored
324 * ex_flags: ignored
325 * ex_anon_uid: ignored
326 * ex_anon_gid: ignored
327 * size: size in bytes of passed in nfsctl_export
328 * Output:
329 * On success: returns zero
330 * On error: return code is negative errno value
331 */
219static ssize_t write_unexport(struct file *file, char *buf, size_t size) 332static ssize_t write_unexport(struct file *file, char *buf, size_t size)
220{ 333{
221 struct nfsctl_export *data; 334 struct nfsctl_export *data;
@@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size)
226 return exp_unexport(data); 339 return exp_unexport(data);
227} 340}
228 341
342/**
343 * write_getfs - Get a variable-length NFS file handle by path
344 *
345 * Deprecated. /proc/fs/nfsd/filehandle is preferred.
346 * Function remains to support old versions of nfs-utils.
347 *
348 * Input:
349 * buf: struct nfsctl_fsparm
350 * gd_addr: socket address of client
351 * gd_path: '\0'-terminated C string
352 * containing pathname of
353 * directory in local file system
354 * gd_maxlen: maximum size of returned file
355 * handle
356 * size: size in bytes of passed in nfsctl_fsparm
357 * Output:
358 * On success: passed-in buffer filled with a knfsd_fh structure
359 * (a variable-length raw NFS file handle);
360 * return code is the size in bytes of the file handle
361 * On error: return code is negative errno value
362 *
363 * Note: Only AF_INET client addresses are passed in, since gd_addr
364 * is the same size as a struct sockaddr_in.
365 */
229static ssize_t write_getfs(struct file *file, char *buf, size_t size) 366static ssize_t write_getfs(struct file *file, char *buf, size_t size)
230{ 367{
231 struct nfsctl_fsparm *data; 368 struct nfsctl_fsparm *data;
@@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
265 return err; 402 return err;
266} 403}
267 404
405/**
406 * write_getfd - Get a fixed-length NFS file handle by path (used by mountd)
407 *
408 * Deprecated. /proc/fs/nfsd/filehandle is preferred.
409 * Function remains to support old versions of nfs-utils.
410 *
411 * Input:
412 * buf: struct nfsctl_fdparm
413 * gd_addr: socket address of client
414 * gd_path: '\0'-terminated C string
415 * containing pathname of
416 * directory in local file system
417 * gd_version: fdparm structure version
418 * size: size in bytes of passed in nfsctl_fdparm
419 * Output:
420 * On success: passed-in buffer filled with nfsctl_res
421 * (a fixed-length raw NFS file handle);
422 * return code is the size in bytes of the file handle
423 * On error: return code is negative errno value
424 *
425 * Note: Only AF_INET client addresses are passed in, since gd_addr
426 * is the same size as a struct sockaddr_in.
427 */
268static ssize_t write_getfd(struct file *file, char *buf, size_t size) 428static ssize_t write_getfd(struct file *file, char *buf, size_t size)
269{ 429{
270 struct nfsctl_fdparm *data; 430 struct nfsctl_fdparm *data;
@@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
309 return err; 469 return err;
310} 470}
311 471
312static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) 472/**
473 * write_unlock_ip - Release all locks used by a client
474 *
475 * Experimental.
476 *
477 * Input:
478 * buf: '\n'-terminated C string containing a
479 * presentation format IPv4 address
480 * size: length of C string in @buf
481 * Output:
482 * On success: returns zero if all specified locks were released;
483 * returns one if one or more locks were not released
484 * On error: return code is negative errno value
485 *
486 * Note: Only AF_INET client addresses are passed in
487 */
488static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
313{ 489{
314 struct sockaddr_in sin = { 490 struct sockaddr_in sin = {
315 .sin_family = AF_INET, 491 .sin_family = AF_INET,
@@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
339 return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); 515 return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
340} 516}
341 517
342static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) 518/**
519 * write_unlock_fs - Release all locks on a local file system
520 *
521 * Experimental.
522 *
523 * Input:
524 * buf: '\n'-terminated C string containing the
525 * absolute pathname of a local file system
526 * size: length of C string in @buf
527 * Output:
528 * On success: returns zero if all specified locks were released;
529 * returns one if one or more locks were not released
530 * On error: return code is negative errno value
531 */
532static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
343{ 533{
344 struct path path; 534 struct path path;
345 char *fo_path; 535 char *fo_path;
@@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
360 if (error) 550 if (error)
361 return error; 551 return error;
362 552
553 /*
554 * XXX: Needs better sanity checking. Otherwise we could end up
555 * releasing locks on the wrong file system.
556 *
557 * For example:
558 * 1. Does the path refer to a directory?
559 * 2. Is that directory a mount point, or
560 * 3. Is that directory the root of an exported file system?
561 */
363 error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); 562 error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
364 563
365 path_put(&path); 564 path_put(&path);
366 return error; 565 return error;
367} 566}
368 567
568/**
569 * write_filehandle - Get a variable-length NFS file handle by path
570 *
571 * On input, the buffer contains a '\n'-terminated C string comprised of
572 * three alphanumeric words separated by whitespace. The string may
573 * contain escape sequences.
574 *
575 * Input:
576 * buf:
577 * domain: client domain name
578 * path: export pathname
579 * maxsize: numeric maximum size of
580 * @buf
581 * size: length of C string in @buf
582 * Output:
583 * On success: passed-in buffer filled with '\n'-terminated C
584 * string containing an ASCII hex text version
585 * of the NFS file handle;
586 * return code is the size in bytes of the string
587 * On error: return code is negative errno value
588 */
369static ssize_t write_filehandle(struct file *file, char *buf, size_t size) 589static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
370{ 590{
371 /* request is:
372 * domain path maxsize
373 * response is
374 * filehandle
375 *
376 * qword quoting is used, so filehandle will be \x....
377 */
378 char *dname, *path; 591 char *dname, *path;
379 int uninitialized_var(maxsize); 592 int uninitialized_var(maxsize);
380 char *mesg = buf; 593 char *mesg = buf;
@@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
391 604
392 dname = mesg; 605 dname = mesg;
393 len = qword_get(&mesg, dname, size); 606 len = qword_get(&mesg, dname, size);
394 if (len <= 0) return -EINVAL; 607 if (len <= 0)
608 return -EINVAL;
395 609
396 path = dname+len+1; 610 path = dname+len+1;
397 len = qword_get(&mesg, path, size); 611 len = qword_get(&mesg, path, size);
398 if (len <= 0) return -EINVAL; 612 if (len <= 0)
613 return -EINVAL;
399 614
400 len = get_int(&mesg, &maxsize); 615 len = get_int(&mesg, &maxsize);
401 if (len) 616 if (len)
@@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
419 if (len) 634 if (len)
420 return len; 635 return len;
421 636
422 mesg = buf; len = SIMPLE_TRANSACTION_LIMIT; 637 mesg = buf;
638 len = SIMPLE_TRANSACTION_LIMIT;
423 qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); 639 qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
424 mesg[-1] = '\n'; 640 mesg[-1] = '\n';
425 return mesg - buf; 641 return mesg - buf;
426} 642}
427 643
644/**
645 * write_threads - Start NFSD, or report the current number of running threads
646 *
647 * Input:
648 * buf: ignored
649 * size: zero
650 * Output:
651 * On success: passed-in buffer filled with '\n'-terminated C
652 * string numeric value representing the number of
653 * running NFSD threads;
654 * return code is the size in bytes of the string
655 * On error: return code is zero
656 *
657 * OR
658 *
659 * Input:
660 * buf: C string containing an unsigned
661 * integer value representing the
662 * number of NFSD threads to start
663 * size: non-zero length of C string in @buf
664 * Output:
665 * On success: NFS service is started;
666 * passed-in buffer filled with '\n'-terminated C
667 * string numeric value representing the number of
668 * running NFSD threads;
669 * return code is the size in bytes of the string
670 * On error: return code is zero or a negative errno value
671 */
428static ssize_t write_threads(struct file *file, char *buf, size_t size) 672static ssize_t write_threads(struct file *file, char *buf, size_t size)
429{ 673{
430 /* if size > 0, look for a number of threads and call nfsd_svc
431 * then write out number of threads as reply
432 */
433 char *mesg = buf; 674 char *mesg = buf;
434 int rv; 675 int rv;
435 if (size > 0) { 676 if (size > 0) {
@@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
437 rv = get_int(&mesg, &newthreads); 678 rv = get_int(&mesg, &newthreads);
438 if (rv) 679 if (rv)
439 return rv; 680 return rv;
440 if (newthreads <0) 681 if (newthreads < 0)
441 return -EINVAL; 682 return -EINVAL;
442 rv = nfsd_svc(2049, newthreads); 683 rv = nfsd_svc(NFS_PORT, newthreads);
443 if (rv) 684 if (rv)
444 return rv; 685 return rv;
445 } 686 }
@@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
447 return strlen(buf); 688 return strlen(buf);
448} 689}
449 690
691/**
692 * write_pool_threads - Set or report the current number of threads per pool
693 *
694 * Input:
695 * buf: ignored
696 * size: zero
697 *
698 * OR
699 *
700 * Input:
701 * buf: C string containing whitespace-
702 * separated unsigned integer values
703 * representing the number of NFSD
704 * threads to start in each pool
705 * size: non-zero length of C string in @buf
706 * Output:
707 * On success: passed-in buffer filled with '\n'-terminated C
708 * string containing integer values representing the
709 * number of NFSD threads in each pool;
710 * return code is the size in bytes of the string
711 * On error: return code is zero or a negative errno value
712 */
450static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) 713static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
451{ 714{
452 /* if size > 0, look for an array of number of threads per node 715 /* if size > 0, look for an array of number of threads per node
@@ -517,10 +780,6 @@ out_free:
517 780
518static ssize_t __write_versions(struct file *file, char *buf, size_t size) 781static ssize_t __write_versions(struct file *file, char *buf, size_t size)
519{ 782{
520 /*
521 * Format:
522 * [-/+]vers [-/+]vers ...
523 */
524 char *mesg = buf; 783 char *mesg = buf;
525 char *vers, sign; 784 char *vers, sign;
526 int len, num; 785 int len, num;
@@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
578 return len; 837 return len;
579} 838}
580 839
840/**
841 * write_versions - Set or report the available NFS protocol versions
842 *
843 * Input:
844 * buf: ignored
845 * size: zero
846 * Output:
847 * On success: passed-in buffer filled with '\n'-terminated C
848 * string containing positive or negative integer
849 * values representing the current status of each
850 * protocol version;
851 * return code is the size in bytes of the string
852 * On error: return code is zero or a negative errno value
853 *
854 * OR
855 *
856 * Input:
857 * buf: C string containing whitespace-
858 * separated positive or negative
859 * integer values representing NFS
860 * protocol versions to enable ("+n")
861 * or disable ("-n")
862 * size: non-zero length of C string in @buf
863 * Output:
864 * On success: status of zero or more protocol versions has
865 * been updated; passed-in buffer filled with
866 * '\n'-terminated C string containing positive
867 * or negative integer values representing the
868 * current status of each protocol version;
869 * return code is the size in bytes of the string
870 * On error: return code is zero or a negative errno value
871 */
581static ssize_t write_versions(struct file *file, char *buf, size_t size) 872static ssize_t write_versions(struct file *file, char *buf, size_t size)
582{ 873{
583 ssize_t rv; 874 ssize_t rv;
@@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
687 return -EINVAL; 978 return -EINVAL;
688} 979}
689 980
981/**
982 * write_ports - Pass a socket file descriptor or transport name to listen on
983 *
984 * Input:
985 * buf: ignored
986 * size: zero
987 * Output:
988 * On success: passed-in buffer filled with a '\n'-terminated C
989 * string containing a whitespace-separated list of
990 * named NFSD listeners;
991 * return code is the size in bytes of the string
992 * On error: return code is zero or a negative errno value
993 *
994 * OR
995 *
996 * Input:
997 * buf: C string containing an unsigned
998 * integer value representing a bound
999 * but unconnected socket that is to be
1000 * used as an NFSD listener
1001 * size: non-zero length of C string in @buf
1002 * Output:
1003 * On success: NFS service is started;
1004 * passed-in buffer filled with a '\n'-terminated C
1005 * string containing a unique alphanumeric name of
1006 * the listener;
1007 * return code is the size in bytes of the string
1008 * On error: return code is a negative errno value
1009 *
1010 * OR
1011 *
1012 * Input:
1013 * buf: C string containing a "-" followed
1014 * by an integer value representing a
1015 * previously passed in socket file
1016 * descriptor
1017 * size: non-zero length of C string in @buf
1018 * Output:
1019 * On success: NFS service no longer listens on that socket;
1020 * passed-in buffer filled with a '\n'-terminated C
1021 * string containing a unique name of the listener;
1022 * return code is the size in bytes of the string
1023 * On error: return code is a negative errno value
1024 *
1025 * OR
1026 *
1027 * Input:
1028 * buf: C string containing a transport
1029 * name and an unsigned integer value
1030 * representing the port to listen on,
1031 * separated by whitespace
1032 * size: non-zero length of C string in @buf
1033 * Output:
1034 * On success: returns zero; NFS service is started
1035 * On error: return code is a negative errno value
1036 *
1037 * OR
1038 *
1039 * Input:
1040 * buf: C string containing a "-" followed
1041 * by a transport name and an unsigned
1042 * integer value representing the port
1043 * to listen on, separated by whitespace
1044 * size: non-zero length of C string in @buf
1045 * Output:
1046 * On success: returns zero; NFS service no longer listens
1047 * on that transport
1048 * On error: return code is a negative errno value
1049 */
690static ssize_t write_ports(struct file *file, char *buf, size_t size) 1050static ssize_t write_ports(struct file *file, char *buf, size_t size)
691{ 1051{
692 ssize_t rv; 1052 ssize_t rv;
@@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
700 1060
701int nfsd_max_blksize; 1061int nfsd_max_blksize;
702 1062
1063/**
1064 * write_maxblksize - Set or report the current NFS blksize
1065 *
1066 * Input:
1067 * buf: ignored
1068 * size: zero
1069 *
1070 * OR
1071 *
1072 * Input:
1073 * buf: C string containing an unsigned
1074 * integer value representing the new
1075 * NFS blksize
1076 * size: non-zero length of C string in @buf
1077 * Output:
1078 * On success: passed-in buffer filled with '\n'-terminated C string
1079 * containing numeric value of the current NFS blksize
1080 * setting;
1081 * return code is the size in bytes of the string
1082 * On error: return code is zero or a negative errno value
1083 */
703static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) 1084static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
704{ 1085{
705 char *mesg = buf; 1086 char *mesg = buf;
@@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
752 return strlen(buf); 1133 return strlen(buf);
753} 1134}
754 1135
1136/**
1137 * write_leasetime - Set or report the current NFSv4 lease time
1138 *
1139 * Input:
1140 * buf: ignored
1141 * size: zero
1142 *
1143 * OR
1144 *
1145 * Input:
1146 * buf: C string containing an unsigned
1147 * integer value representing the new
1148 * NFSv4 lease expiry time
1149 * size: non-zero length of C string in @buf
1150 * Output:
1151 * On success: passed-in buffer filled with '\n'-terminated C
1152 * string containing unsigned integer value of the
1153 * current lease expiry time;
1154 * return code is the size in bytes of the string
1155 * On error: return code is zero or a negative errno value
1156 */
755static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 1157static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
756{ 1158{
757 ssize_t rv; 1159 ssize_t rv;
@@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
788 return strlen(buf); 1190 return strlen(buf);
789} 1191}
790 1192
1193/**
1194 * write_recoverydir - Set or report the pathname of the recovery directory
1195 *
1196 * Input:
1197 * buf: ignored
1198 * size: zero
1199 *
1200 * OR
1201 *
1202 * Input:
1203 * buf: C string containing the pathname
1204 * of the directory on a local file
1205 * system containing permanent NFSv4
1206 * recovery data
1207 * size: non-zero length of C string in @buf
1208 * Output:
1209 * On success: passed-in buffer filled with '\n'-terminated C string
1210 * containing the current recovery pathname setting;
1211 * return code is the size in bytes of the string
1212 * On error: return code is zero or a negative errno value
1213 */
791static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) 1214static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
792{ 1215{
793 ssize_t rv; 1216 ssize_t rv;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index f0da7d9c3a92..9f1ca17293d3 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -258,14 +258,32 @@ out:
258 return error; 258 return error;
259} 259}
260 260
261/* 261/**
262 * Perform sanity checks on the dentry in a client's file handle. 262 * fh_verify - filehandle lookup and access checking
263 * @rqstp: pointer to current rpc request
264 * @fhp: filehandle to be verified
265 * @type: expected type of object pointed to by filehandle
266 * @access: type of access needed to object
267 *
268 * Look up a dentry from the on-the-wire filehandle, check the client's
269 * access to the export, and set the current task's credentials.
270 *
271 * Regardless of success or failure of fh_verify(), fh_put() should be
272 * called on @fhp when the caller is finished with the filehandle.
263 * 273 *
264 * Note that the file handle dentry may need to be freed even after 274 * fh_verify() may be called multiple times on a given filehandle, for
265 * an error return. 275 * example, when processing an NFSv4 compound. The first call will look
276 * up a dentry using the on-the-wire filehandle. Subsequent calls will
277 * skip the lookup and just perform the other checks and possibly change
278 * the current task's credentials.
266 * 279 *
267 * This is only called at the start of an nfsproc call, so fhp points to 280 * @type specifies the type of object expected using one of the S_IF*
268 * a svc_fh which is all 0 except for the over-the-wire file handle. 281 * constants defined in include/linux/stat.h. The caller may use zero
282 * to indicate that it doesn't care, or a negative integer to indicate
283 * that it expects something not of the given type.
284 *
285 * @access is formed from the NFSD_MAY_* constants defined in
286 * include/linux/nfsd/nfsd.h.
269 */ 287 */
270__be32 288__be32
271fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) 289fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
@@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
466 goto retry; 484 goto retry;
467 break; 485 break;
468 } 486 }
487 } else if (exp->ex_flags & NFSEXP_FSID) {
488 fsid_type = FSID_NUM;
469 } else if (exp->ex_uuid) { 489 } else if (exp->ex_uuid) {
470 if (fhp->fh_maxsize >= 64) { 490 if (fhp->fh_maxsize >= 64) {
471 if (root_export) 491 if (root_export)
@@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
478 else 498 else
479 fsid_type = FSID_UUID4_INUM; 499 fsid_type = FSID_UUID4_INUM;
480 } 500 }
481 } else if (exp->ex_flags & NFSEXP_FSID) 501 } else if (!old_valid_dev(ex_dev))
482 fsid_type = FSID_NUM;
483 else if (!old_valid_dev(ex_dev))
484 /* for newer device numbers, we must use a newer fsid format */ 502 /* for newer device numbers, we must use a newer fsid format */
485 fsid_type = FSID_ENCODE_DEV; 503 fsid_type = FSID_ENCODE_DEV;
486 else 504 else
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cffeca7acef..6f7f26351227 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -622,6 +622,7 @@ nfserrno (int errno)
622 { nfserr_badname, -ESRCH }, 622 { nfserr_badname, -ESRCH },
623 { nfserr_io, -ETXTBSY }, 623 { nfserr_io, -ETXTBSY },
624 { nfserr_notsupp, -EOPNOTSUPP }, 624 { nfserr_notsupp, -EOPNOTSUPP },
625 { nfserr_toosmall, -ETOOSMALL },
625 }; 626 };
626 int i; 627 int i;
627 628
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d1c5f787b365..6e50aaa56ca2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
764 764
765 return err; 765 return err;
766} 766}
767
768 767
769static int 768static int
770nfsd_sync(struct file *filp) 769nfsd_sync(struct file *filp)
@@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1211 dirp = dentry->d_inode; 1210 dirp = dentry->d_inode;
1212 1211
1213 err = nfserr_notdir; 1212 err = nfserr_notdir;
1214 if(!dirp->i_op || !dirp->i_op->lookup) 1213 if (!dirp->i_op->lookup)
1215 goto out; 1214 goto out;
1216 /* 1215 /*
1217 * Check whether the response file handle has been verified yet. 1216 * Check whether the response file handle has been verified yet.
@@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1347 /* Get all the sanity checks out of the way before 1346 /* Get all the sanity checks out of the way before
1348 * we lock the parent. */ 1347 * we lock the parent. */
1349 err = nfserr_notdir; 1348 err = nfserr_notdir;
1350 if(!dirp->i_op || !dirp->i_op->lookup) 1349 if (!dirp->i_op->lookup)
1351 goto out; 1350 goto out;
1352 fh_lock_nested(fhp, I_MUTEX_PARENT); 1351 fh_lock_nested(fhp, I_MUTEX_PARENT);
1353 1352
@@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1482 inode = dentry->d_inode; 1481 inode = dentry->d_inode;
1483 1482
1484 err = nfserr_inval; 1483 err = nfserr_inval;
1485 if (!inode->i_op || !inode->i_op->readlink) 1484 if (!inode->i_op->readlink)
1486 goto out; 1485 goto out;
1487 1486
1488 touch_atime(fhp->fh_export->ex_path.mnt, dentry); 1487 touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
2162 size_t size; 2161 size_t size;
2163 int error; 2162 int error;
2164 2163
2165 if (!IS_POSIXACL(inode) || !inode->i_op || 2164 if (!IS_POSIXACL(inode) ||
2166 !inode->i_op->setxattr || !inode->i_op->removexattr) 2165 !inode->i_op->setxattr || !inode->i_op->removexattr)
2167 return -EOPNOTSUPP; 2166 return -EOPNOTSUPP;
2168 switch(type) { 2167 switch(type) {
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 000000000000..50914d7303c6
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,2 @@
1source "fs/notify/dnotify/Kconfig"
2source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 000000000000..5a95b6010ce7
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,2 @@
1obj-y += dnotify/
2obj-y += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 000000000000..26adf5dfa646
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,10 @@
1config DNOTIFY
2 bool "Dnotify support"
3 default y
4 help
5 Dnotify is a directory-based per-fd file change notification system
6 that uses signals to communicate events to user-space. There exist
7 superior alternatives, but some applications may still rely on
8 dnotify.
9
10 If unsure, say Y.
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile
new file mode 100644
index 000000000000..f145251dcadb
--- /dev/null
+++ b/fs/notify/dnotify/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_DNOTIFY) += dnotify.o
diff --git a/fs/dnotify.c b/fs/notify/dnotify/dnotify.c
index 676073b8dda5..b0aa2cde80bd 100644
--- a/fs/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
115 dn->dn_next = inode->i_dnotify; 115 dn->dn_next = inode->i_dnotify;
116 inode->i_dnotify = dn; 116 inode->i_dnotify = dn;
117 spin_unlock(&inode->i_lock); 117 spin_unlock(&inode->i_lock);
118
119 if (filp->f_op && filp->f_op->dir_notify)
120 return filp->f_op->dir_notify(filp, arg);
121 return 0; 118 return 0;
122 119
123out_free: 120out_free:
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
new file mode 100644
index 000000000000..446792841023
--- /dev/null
+++ b/fs/notify/inotify/Kconfig
@@ -0,0 +1,27 @@
1config INOTIFY
2 bool "Inotify file change notification support"
3 default y
4 ---help---
5 Say Y here to enable inotify support. Inotify is a file change
6 notification system and a replacement for dnotify. Inotify fixes
7 numerous shortcomings in dnotify and introduces several new features
8 including multiple file events, one-shot support, and unmount
9 notification.
10
11 For more information, see <file:Documentation/filesystems/inotify.txt>
12
13 If unsure, say Y.
14
15config INOTIFY_USER
16 bool "Inotify support for userspace"
17 depends on INOTIFY
18 default y
19 ---help---
20 Say Y here to enable inotify support for userspace, including the
21 associated system calls. Inotify allows monitoring of both files and
22 directories via a single open fd. Events are read from the file
23 descriptor, which is also select()- and poll()-able.
24
25 For more information, see <file:Documentation/filesystems/inotify.txt>
26
27 If unsure, say Y.
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
new file mode 100644
index 000000000000..e290f3bb9d8d
--- /dev/null
+++ b/fs/notify/inotify/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
diff --git a/fs/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..dae3f28f30d4 100644
--- a/fs/inotify.c
+++ b/fs/notify/inotify/inotify.c
diff --git a/fs/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e2425bbd871f..bed766e435b5 100644
--- a/fs/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -76,10 +76,10 @@ struct inotify_device {
76 struct mutex ev_mutex; /* protects event queue */ 76 struct mutex ev_mutex; /* protects event queue */
77 struct mutex up_mutex; /* synchronizes watch updates */ 77 struct mutex up_mutex; /* synchronizes watch updates */
78 struct list_head events; /* list of queued events */ 78 struct list_head events; /* list of queued events */
79 atomic_t count; /* reference count */
80 struct user_struct *user; /* user who opened this dev */ 79 struct user_struct *user; /* user who opened this dev */
81 struct inotify_handle *ih; /* inotify handle */ 80 struct inotify_handle *ih; /* inotify handle */
82 struct fasync_struct *fa; /* async notification */ 81 struct fasync_struct *fa; /* async notification */
82 atomic_t count; /* reference count */
83 unsigned int queue_size; /* size of the queue (bytes) */ 83 unsigned int queue_size; /* size of the queue (bytes) */
84 unsigned int event_count; /* number of pending events */ 84 unsigned int event_count; /* number of pending events */
85 unsigned int max_events; /* maximum number of events */ 85 unsigned int max_events; /* maximum number of events */
@@ -427,10 +427,61 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
427 return ret; 427 return ret;
428} 428}
429 429
430/*
431 * Get an inotify_kernel_event if one exists and is small
432 * enough to fit in "count". Return an error pointer if
433 * "count" is not large enough to hold the event.
434 *
435 * Called with the device ev_mutex held.
436 */
437static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
438 size_t count)
439{
440 size_t event_size = sizeof(struct inotify_event);
441 struct inotify_kernel_event *kevent;
442
443 if (list_empty(&dev->events))
444 return NULL;
445
446 kevent = inotify_dev_get_event(dev);
447 if (kevent->name)
448 event_size += kevent->event.len;
449
450 if (event_size > count)
451 return ERR_PTR(-EINVAL);
452
453 remove_kevent(dev, kevent);
454 return kevent;
455}
456
457/*
458 * Copy an event to user space, returning how much we copied.
459 *
460 * We already checked that the event size is smaller than the
461 * buffer we had in "get_one_event()" above.
462 */
463static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent,
464 char __user *buf)
465{
466 size_t event_size = sizeof(struct inotify_event);
467
468 if (copy_to_user(buf, &kevent->event, event_size))
469 return -EFAULT;
470
471 if (kevent->name) {
472 buf += event_size;
473
474 if (copy_to_user(buf, kevent->name, kevent->event.len))
475 return -EFAULT;
476
477 event_size += kevent->event.len;
478 }
479 return event_size;
480}
481
430static ssize_t inotify_read(struct file *file, char __user *buf, 482static ssize_t inotify_read(struct file *file, char __user *buf,
431 size_t count, loff_t *pos) 483 size_t count, loff_t *pos)
432{ 484{
433 size_t event_size = sizeof (struct inotify_event);
434 struct inotify_device *dev; 485 struct inotify_device *dev;
435 char __user *start; 486 char __user *start;
436 int ret; 487 int ret;
@@ -440,81 +491,43 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
440 dev = file->private_data; 491 dev = file->private_data;
441 492
442 while (1) { 493 while (1) {
494 struct inotify_kernel_event *kevent;
443 495
444 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); 496 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
445 497
446 mutex_lock(&dev->ev_mutex); 498 mutex_lock(&dev->ev_mutex);
447 if (!list_empty(&dev->events)) { 499 kevent = get_one_event(dev, count);
448 ret = 0;
449 break;
450 }
451 mutex_unlock(&dev->ev_mutex); 500 mutex_unlock(&dev->ev_mutex);
452 501
453 if (file->f_flags & O_NONBLOCK) { 502 if (kevent) {
454 ret = -EAGAIN; 503 ret = PTR_ERR(kevent);
455 break; 504 if (IS_ERR(kevent))
456 } 505 break;
457 506 ret = copy_event_to_user(kevent, buf);
458 if (signal_pending(current)) { 507 free_kevent(kevent);
459 ret = -EINTR; 508 if (ret < 0)
460 break; 509 break;
510 buf += ret;
511 count -= ret;
512 continue;
461 } 513 }
462 514
463 schedule(); 515 ret = -EAGAIN;
464 } 516 if (file->f_flags & O_NONBLOCK)
465
466 finish_wait(&dev->wq, &wait);
467 if (ret)
468 return ret;
469
470 while (1) {
471 struct inotify_kernel_event *kevent;
472
473 ret = buf - start;
474 if (list_empty(&dev->events))
475 break; 517 break;
476 518 ret = -EINTR;
477 kevent = inotify_dev_get_event(dev); 519 if (signal_pending(current))
478 if (event_size + kevent->event.len > count) {
479 if (ret == 0 && count > 0) {
480 /*
481 * could not get a single event because we
482 * didn't have enough buffer space.
483 */
484 ret = -EINVAL;
485 }
486 break; 520 break;
487 }
488 remove_kevent(dev, kevent);
489 521
490 /* 522 if (start != buf)
491 * Must perform the copy_to_user outside the mutex in order
492 * to avoid a lock order reversal with mmap_sem.
493 */
494 mutex_unlock(&dev->ev_mutex);
495
496 if (copy_to_user(buf, &kevent->event, event_size)) {
497 ret = -EFAULT;
498 break; 523 break;
499 }
500 buf += event_size;
501 count -= event_size;
502
503 if (kevent->name) {
504 if (copy_to_user(buf, kevent->name, kevent->event.len)){
505 ret = -EFAULT;
506 break;
507 }
508 buf += kevent->event.len;
509 count -= kevent->event.len;
510 }
511
512 free_kevent(kevent);
513 524
514 mutex_lock(&dev->ev_mutex); 525 schedule();
515 } 526 }
516 mutex_unlock(&dev->ev_mutex);
517 527
528 finish_wait(&dev->wq, &wait);
529 if (start != buf && ret != -EFAULT)
530 ret = buf - start;
518 return ret; 531 return ret;
519} 532}
520 533
@@ -576,7 +589,7 @@ static const struct inotify_operations inotify_user_ops = {
576 .destroy_watch = free_inotify_user_watch, 589 .destroy_watch = free_inotify_user_watch,
577}; 590};
578 591
579asmlinkage long sys_inotify_init1(int flags) 592SYSCALL_DEFINE1(inotify_init1, int, flags)
580{ 593{
581 struct inotify_device *dev; 594 struct inotify_device *dev;
582 struct inotify_handle *ih; 595 struct inotify_handle *ih;
@@ -655,12 +668,13 @@ out_put_fd:
655 return ret; 668 return ret;
656} 669}
657 670
658asmlinkage long sys_inotify_init(void) 671SYSCALL_DEFINE0(inotify_init)
659{ 672{
660 return sys_inotify_init1(0); 673 return sys_inotify_init1(0);
661} 674}
662 675
663asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask) 676SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
677 u32, mask)
664{ 678{
665 struct inode *inode; 679 struct inode *inode;
666 struct inotify_device *dev; 680 struct inotify_device *dev;
@@ -704,7 +718,7 @@ fput_and_out:
704 return ret; 718 return ret;
705} 719}
706 720
707asmlinkage long sys_inotify_rm_watch(int fd, u32 wd) 721SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
708{ 722{
709 struct file *filp; 723 struct file *filp;
710 struct inotify_device *dev; 724 struct inotify_device *dev;
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
new file mode 100644
index 000000000000..f5a868cc9152
--- /dev/null
+++ b/fs/ntfs/Kconfig
@@ -0,0 +1,78 @@
1config NTFS_FS
2 tristate "NTFS file system support"
3 select NLS
4 help
5 NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
6
7 Saying Y or M here enables read support. There is partial, but
8 safe, write support available. For write support you must also
9 say Y to "NTFS write support" below.
10
11 There are also a number of user-space tools available, called
12 ntfsprogs. These include ntfsundelete and ntfsresize, that work
13 without NTFS support enabled in the kernel.
14
15 This is a rewrite from scratch of Linux NTFS support and replaced
16 the old NTFS code starting with Linux 2.5.11. A backport to
17 the Linux 2.4 kernel series is separately available as a patch
18 from the project web site.
19
20 For more information see <file:Documentation/filesystems/ntfs.txt>
21 and <http://www.linux-ntfs.org/>.
22
23 To compile this file system support as a module, choose M here: the
24 module will be called ntfs.
25
26 If you are not using Windows NT, 2000, XP or 2003 in addition to
27 Linux on your computer it is safe to say N.
28
29config NTFS_DEBUG
30 bool "NTFS debugging support"
31 depends on NTFS_FS
32 help
33 If you are experiencing any problems with the NTFS file system, say
34 Y here. This causes the driver to perform additional consistency
35 checks and to write additional debugging messages to the system
36 log. Note that debugging messages are
37 disabled by default. To enable them, supply the option debug_msgs=1
38 at the kernel command line when booting the kernel or as an option
39 to insmod when loading the ntfs module. Once the driver is active,
40 you can enable debugging messages by doing (as root):
41 echo 1 > /proc/sys/fs/ntfs-debug
42 Replacing the "1" with "0" would disable debug messages.
43
44 If you leave debugging messages disabled, this results in little
45 overhead, but enabling debug messages results in very significant
46 slowdown of the system.
47
48 When reporting bugs, please try to have available a full dump of
49 debugging messages while the misbehaviour was occurring.
50
51config NTFS_RW
52 bool "NTFS write support"
53 depends on NTFS_FS
54 help
55 This enables the partial, but safe, write support in the NTFS driver.
56
57 The only supported operation is overwriting existing files, without
58 changing the file length. No file or directory creation, deletion or
59 renaming is possible. Note that only non-resident files can be written to,
60 so you may find that some very small files (<500 bytes or so) cannot
61 be written to.
62
63 While we cannot guarantee that it will not damage any data, we have
64 so far not received a single report where the driver would have
65 damaged someones data so we assume it is perfectly safe to use.
66
67 Note: While write support is safe in this version (a rewrite from
68 scratch of the NTFS support), it should be noted that the old NTFS
69 write support, included in Linux 2.5.10 and before (since 1997),
70 is not safe.
71
72 This is currently useful with TopologiLinux. TopologiLinux is run
73 on top of any DOS/Microsoft Windows system without partitioning your
74 hard disk. Unlike other Linux distributions TopologiLinux does not
75 need its own partition. For more information see
76 <http://topologi-linux.sourceforge.net/>
77
78 It is perfectly safe to say N here.
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e9da092e2772..86bef156cf0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
1406 ni->allocated_size = sle64_to_cpu( 1406 ni->allocated_size = sle64_to_cpu(
1407 a->data.non_resident.allocated_size); 1407 a->data.non_resident.allocated_size);
1408 } 1408 }
1409 /* Setup the operations for this attribute inode. */
1410 vi->i_op = NULL;
1411 vi->i_fop = NULL;
1412 if (NInoMstProtected(ni)) 1409 if (NInoMstProtected(ni))
1413 vi->i_mapping->a_ops = &ntfs_mst_aops; 1410 vi->i_mapping->a_ops = &ntfs_mst_aops;
1414 else 1411 else
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
new file mode 100644
index 000000000000..701b7a3a872e
--- /dev/null
+++ b/fs/ocfs2/Kconfig
@@ -0,0 +1,85 @@
1config OCFS2_FS
2 tristate "OCFS2 file system support"
3 depends on NET && SYSFS
4 select CONFIGFS_FS
5 select JBD2
6 select CRC32
7 select QUOTA
8 select QUOTA_TREE
9 help
10 OCFS2 is a general purpose extent based shared disk cluster file
11 system with many similarities to ext3. It supports 64 bit inode
12 numbers, and has automatically extending metadata groups which may
13 also make it attractive for non-clustered use.
14
15 You'll want to install the ocfs2-tools package in order to at least
16 get "mount.ocfs2".
17
18 Project web page: http://oss.oracle.com/projects/ocfs2
19 Tools web page: http://oss.oracle.com/projects/ocfs2-tools
20 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
21
22 For more information on OCFS2, see the file
23 <file:Documentation/filesystems/ocfs2.txt>.
24
25config OCFS2_FS_O2CB
26 tristate "O2CB Kernelspace Clustering"
27 depends on OCFS2_FS
28 default y
29 help
30 OCFS2 includes a simple kernelspace clustering package, the OCFS2
31 Cluster Base. It only requires a very small userspace component
32 to configure it. This comes with the standard ocfs2-tools package.
33 O2CB is limited to maintaining a cluster for OCFS2 file systems.
34 It cannot manage any other cluster applications.
35
36 It is always safe to say Y here, as the clustering method is
37 run-time selectable.
38
39config OCFS2_FS_USERSPACE_CLUSTER
40 tristate "OCFS2 Userspace Clustering"
41 depends on OCFS2_FS && DLM
42 default y
43 help
44 This option will allow OCFS2 to use userspace clustering services
45 in conjunction with the DLM in fs/dlm. If you are using a
46 userspace cluster manager, say Y here.
47
48 It is safe to say Y, as the clustering method is run-time
49 selectable.
50
51config OCFS2_FS_STATS
52 bool "OCFS2 statistics"
53 depends on OCFS2_FS
54 default y
55 help
56 This option allows some fs statistics to be captured. Enabling
57 this option may increase the memory consumption.
58
59config OCFS2_DEBUG_MASKLOG
60 bool "OCFS2 logging support"
61 depends on OCFS2_FS
62 default y
63 help
64 The ocfs2 filesystem has an extensive logging system. The system
65 allows selection of events to log via files in /sys/o2cb/logmask/.
66 This option will enlarge your kernel, but it allows debugging of
67 ocfs2 filesystem issues.
68
69config OCFS2_DEBUG_FS
70 bool "OCFS2 expensive checks"
71 depends on OCFS2_FS
72 default n
73 help
74 This option will enable expensive consistency checks. Enable
75 this option for debugging only as it is likely to decrease
76 performance of the filesystem.
77
78config OCFS2_FS_POSIX_ACL
79 bool "OCFS2 POSIX Access Control Lists"
80 depends on OCFS2_FS
81 select FS_POSIX_ACL
82 default n
83 help
84 POSIX Access Control Lists (ACLs) support permissions for users and
85 groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3c..01596079dd63 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
12ocfs2-objs := \ 12ocfs2-objs := \
13 alloc.o \ 13 alloc.o \
14 aops.o \ 14 aops.o \
15 blockcheck.o \
15 buffer_head_io.o \ 16 buffer_head_io.o \
16 dcache.o \ 17 dcache.o \
17 dir.o \ 18 dir.o \
@@ -35,8 +36,14 @@ ocfs2-objs := \
35 sysfile.o \ 36 sysfile.o \
36 uptodate.o \ 37 uptodate.o \
37 ver.o \ 38 ver.o \
39 quota_local.o \
40 quota_global.o \
38 xattr.o 41 xattr.o
39 42
43ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
44ocfs2-objs += acl.o
45endif
46
40ocfs2_stackglue-objs := stackglue.o 47ocfs2_stackglue-objs := stackglue.o
41ocfs2_stack_o2cb-objs := stack_o2cb.o 48ocfs2_stack_o2cb-objs := stack_o2cb.o
42ocfs2_stack_user-objs := stack_user.o 49ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 000000000000..12dfb44c22e5
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,479 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * acl.c
5 *
6 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
7 *
8 * CREDITS:
9 * Lots of code in this file is copied from linux/fs/ext3/acl.c.
10 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public
14 * License version 2 as published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 */
21
22#include <linux/init.h>
23#include <linux/module.h>
24#include <linux/string.h>
25
26#define MLOG_MASK_PREFIX ML_INODE
27#include <cluster/masklog.h>
28
29#include "ocfs2.h"
30#include "alloc.h"
31#include "dlmglue.h"
32#include "file.h"
33#include "ocfs2_fs.h"
34
35#include "xattr.h"
36#include "acl.h"
37
38/*
39 * Convert from xattr value to acl struct.
40 */
41static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
42{
43 int n, count;
44 struct posix_acl *acl;
45
46 if (!value)
47 return NULL;
48 if (size < sizeof(struct posix_acl_entry))
49 return ERR_PTR(-EINVAL);
50
51 count = size / sizeof(struct posix_acl_entry);
52 if (count < 0)
53 return ERR_PTR(-EINVAL);
54 if (count == 0)
55 return NULL;
56
57 acl = posix_acl_alloc(count, GFP_NOFS);
58 if (!acl)
59 return ERR_PTR(-ENOMEM);
60 for (n = 0; n < count; n++) {
61 struct ocfs2_acl_entry *entry =
62 (struct ocfs2_acl_entry *)value;
63
64 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
65 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
66 acl->a_entries[n].e_id = le32_to_cpu(entry->e_id);
67 value += sizeof(struct posix_acl_entry);
68
69 }
70 return acl;
71}
72
73/*
74 * Convert acl struct to xattr value.
75 */
76static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
77{
78 struct ocfs2_acl_entry *entry = NULL;
79 char *ocfs2_acl;
80 size_t n;
81
82 *size = acl->a_count * sizeof(struct posix_acl_entry);
83
84 ocfs2_acl = kmalloc(*size, GFP_NOFS);
85 if (!ocfs2_acl)
86 return ERR_PTR(-ENOMEM);
87
88 entry = (struct ocfs2_acl_entry *)ocfs2_acl;
89 for (n = 0; n < acl->a_count; n++, entry++) {
90 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
91 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
92 entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
93 }
94 return ocfs2_acl;
95}
96
97static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
98 int type,
99 struct buffer_head *di_bh)
100{
101 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
102 int name_index;
103 char *value = NULL;
104 struct posix_acl *acl;
105 int retval;
106
107 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
108 return NULL;
109
110 switch (type) {
111 case ACL_TYPE_ACCESS:
112 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
113 break;
114 case ACL_TYPE_DEFAULT:
115 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
116 break;
117 default:
118 return ERR_PTR(-EINVAL);
119 }
120
121 retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
122 if (retval > 0) {
123 value = kmalloc(retval, GFP_NOFS);
124 if (!value)
125 return ERR_PTR(-ENOMEM);
126 retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
127 "", value, retval);
128 }
129
130 if (retval > 0)
131 acl = ocfs2_acl_from_xattr(value, retval);
132 else if (retval == -ENODATA || retval == 0)
133 acl = NULL;
134 else
135 acl = ERR_PTR(retval);
136
137 kfree(value);
138
139 return acl;
140}
141
142
143/*
144 * Get posix acl.
145 */
146static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
147{
148 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
149 struct buffer_head *di_bh = NULL;
150 struct posix_acl *acl;
151 int ret;
152
153 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
154 return NULL;
155
156 ret = ocfs2_inode_lock(inode, &di_bh, 0);
157 if (ret < 0) {
158 mlog_errno(ret);
159 acl = ERR_PTR(ret);
160 return acl;
161 }
162
163 acl = ocfs2_get_acl_nolock(inode, type, di_bh);
164
165 ocfs2_inode_unlock(inode, 0);
166
167 brelse(di_bh);
168
169 return acl;
170}
171
172/*
173 * Set the access or default ACL of an inode.
174 */
175static int ocfs2_set_acl(handle_t *handle,
176 struct inode *inode,
177 struct buffer_head *di_bh,
178 int type,
179 struct posix_acl *acl,
180 struct ocfs2_alloc_context *meta_ac,
181 struct ocfs2_alloc_context *data_ac)
182{
183 int name_index;
184 void *value = NULL;
185 size_t size = 0;
186 int ret;
187
188 if (S_ISLNK(inode->i_mode))
189 return -EOPNOTSUPP;
190
191 switch (type) {
192 case ACL_TYPE_ACCESS:
193 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
194 if (acl) {
195 mode_t mode = inode->i_mode;
196 ret = posix_acl_equiv_mode(acl, &mode);
197 if (ret < 0)
198 return ret;
199 else {
200 inode->i_mode = mode;
201 if (ret == 0)
202 acl = NULL;
203 }
204 }
205 break;
206 case ACL_TYPE_DEFAULT:
207 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
208 if (!S_ISDIR(inode->i_mode))
209 return acl ? -EACCES : 0;
210 break;
211 default:
212 return -EINVAL;
213 }
214
215 if (acl) {
216 value = ocfs2_acl_to_xattr(acl, &size);
217 if (IS_ERR(value))
218 return (int)PTR_ERR(value);
219 }
220
221 if (handle)
222 ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
223 "", value, size, 0,
224 meta_ac, data_ac);
225 else
226 ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
227
228 kfree(value);
229
230 return ret;
231}
232
233int ocfs2_check_acl(struct inode *inode, int mask)
234{
235 struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
236
237 if (IS_ERR(acl))
238 return PTR_ERR(acl);
239 if (acl) {
240 int ret = posix_acl_permission(inode, acl, mask);
241 posix_acl_release(acl);
242 return ret;
243 }
244
245 return -EAGAIN;
246}
247
248int ocfs2_acl_chmod(struct inode *inode)
249{
250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
251 struct posix_acl *acl, *clone;
252 int ret;
253
254 if (S_ISLNK(inode->i_mode))
255 return -EOPNOTSUPP;
256
257 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
258 return 0;
259
260 acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
261 if (IS_ERR(acl) || !acl)
262 return PTR_ERR(acl);
263 clone = posix_acl_clone(acl, GFP_KERNEL);
264 posix_acl_release(acl);
265 if (!clone)
266 return -ENOMEM;
267 ret = posix_acl_chmod_masq(clone, inode->i_mode);
268 if (!ret)
269 ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
270 clone, NULL, NULL);
271 posix_acl_release(clone);
272 return ret;
273}
274
275/*
276 * Initialize the ACLs of a new inode. If parent directory has default ACL,
277 * then clone to new inode. Called from ocfs2_mknod.
278 */
279int ocfs2_init_acl(handle_t *handle,
280 struct inode *inode,
281 struct inode *dir,
282 struct buffer_head *di_bh,
283 struct buffer_head *dir_bh,
284 struct ocfs2_alloc_context *meta_ac,
285 struct ocfs2_alloc_context *data_ac)
286{
287 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
288 struct posix_acl *acl = NULL;
289 int ret = 0;
290
291 if (!S_ISLNK(inode->i_mode)) {
292 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
293 acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
294 dir_bh);
295 if (IS_ERR(acl))
296 return PTR_ERR(acl);
297 }
298 if (!acl)
299 inode->i_mode &= ~current->fs->umask;
300 }
301 if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
302 struct posix_acl *clone;
303 mode_t mode;
304
305 if (S_ISDIR(inode->i_mode)) {
306 ret = ocfs2_set_acl(handle, inode, di_bh,
307 ACL_TYPE_DEFAULT, acl,
308 meta_ac, data_ac);
309 if (ret)
310 goto cleanup;
311 }
312 clone = posix_acl_clone(acl, GFP_NOFS);
313 ret = -ENOMEM;
314 if (!clone)
315 goto cleanup;
316
317 mode = inode->i_mode;
318 ret = posix_acl_create_masq(clone, &mode);
319 if (ret >= 0) {
320 inode->i_mode = mode;
321 if (ret > 0) {
322 ret = ocfs2_set_acl(handle, inode,
323 di_bh, ACL_TYPE_ACCESS,
324 clone, meta_ac, data_ac);
325 }
326 }
327 posix_acl_release(clone);
328 }
329cleanup:
330 posix_acl_release(acl);
331 return ret;
332}
333
334static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
335 char *list,
336 size_t list_len,
337 const char *name,
338 size_t name_len)
339{
340 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
341 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
342
343 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
344 return 0;
345
346 if (list && size <= list_len)
347 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
348 return size;
349}
350
351static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
352 char *list,
353 size_t list_len,
354 const char *name,
355 size_t name_len)
356{
357 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
358 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
359
360 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
361 return 0;
362
363 if (list && size <= list_len)
364 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
365 return size;
366}
367
368static int ocfs2_xattr_get_acl(struct inode *inode,
369 int type,
370 void *buffer,
371 size_t size)
372{
373 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
374 struct posix_acl *acl;
375 int ret;
376
377 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
378 return -EOPNOTSUPP;
379
380 acl = ocfs2_get_acl(inode, type);
381 if (IS_ERR(acl))
382 return PTR_ERR(acl);
383 if (acl == NULL)
384 return -ENODATA;
385 ret = posix_acl_to_xattr(acl, buffer, size);
386 posix_acl_release(acl);
387
388 return ret;
389}
390
391static int ocfs2_xattr_get_acl_access(struct inode *inode,
392 const char *name,
393 void *buffer,
394 size_t size)
395{
396 if (strcmp(name, "") != 0)
397 return -EINVAL;
398 return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
399}
400
401static int ocfs2_xattr_get_acl_default(struct inode *inode,
402 const char *name,
403 void *buffer,
404 size_t size)
405{
406 if (strcmp(name, "") != 0)
407 return -EINVAL;
408 return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
409}
410
411static int ocfs2_xattr_set_acl(struct inode *inode,
412 int type,
413 const void *value,
414 size_t size)
415{
416 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
417 struct posix_acl *acl;
418 int ret = 0;
419
420 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
421 return -EOPNOTSUPP;
422
423 if (!is_owner_or_cap(inode))
424 return -EPERM;
425
426 if (value) {
427 acl = posix_acl_from_xattr(value, size);
428 if (IS_ERR(acl))
429 return PTR_ERR(acl);
430 else if (acl) {
431 ret = posix_acl_valid(acl);
432 if (ret)
433 goto cleanup;
434 }
435 } else
436 acl = NULL;
437
438 ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
439
440cleanup:
441 posix_acl_release(acl);
442 return ret;
443}
444
445static int ocfs2_xattr_set_acl_access(struct inode *inode,
446 const char *name,
447 const void *value,
448 size_t size,
449 int flags)
450{
451 if (strcmp(name, "") != 0)
452 return -EINVAL;
453 return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
454}
455
456static int ocfs2_xattr_set_acl_default(struct inode *inode,
457 const char *name,
458 const void *value,
459 size_t size,
460 int flags)
461{
462 if (strcmp(name, "") != 0)
463 return -EINVAL;
464 return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
465}
466
467struct xattr_handler ocfs2_xattr_acl_access_handler = {
468 .prefix = POSIX_ACL_XATTR_ACCESS,
469 .list = ocfs2_xattr_list_acl_access,
470 .get = ocfs2_xattr_get_acl_access,
471 .set = ocfs2_xattr_set_acl_access,
472};
473
474struct xattr_handler ocfs2_xattr_acl_default_handler = {
475 .prefix = POSIX_ACL_XATTR_DEFAULT,
476 .list = ocfs2_xattr_list_acl_default,
477 .get = ocfs2_xattr_get_acl_default,
478 .set = ocfs2_xattr_set_acl_default,
479};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 000000000000..8f6389ed4da5
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,58 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * acl.h
5 *
6 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17
18#ifndef OCFS2_ACL_H
19#define OCFS2_ACL_H
20
21#include <linux/posix_acl_xattr.h>
22
23struct ocfs2_acl_entry {
24 __le16 e_tag;
25 __le16 e_perm;
26 __le32 e_id;
27};
28
29#ifdef CONFIG_OCFS2_FS_POSIX_ACL
30
31extern int ocfs2_check_acl(struct inode *, int);
32extern int ocfs2_acl_chmod(struct inode *);
33extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
34 struct buffer_head *, struct buffer_head *,
35 struct ocfs2_alloc_context *,
36 struct ocfs2_alloc_context *);
37
38#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
39
40#define ocfs2_check_acl NULL
41static inline int ocfs2_acl_chmod(struct inode *inode)
42{
43 return 0;
44}
45static inline int ocfs2_init_acl(handle_t *handle,
46 struct inode *inode,
47 struct inode *dir,
48 struct buffer_head *di_bh,
49 struct buffer_head *dir_bh,
50 struct ocfs2_alloc_context *meta_ac,
51 struct ocfs2_alloc_context *data_ac)
52{
53 return 0;
54}
55
56#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
57
58#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394c..60fe74035db5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/quotaops.h>
31 32
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC 33#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
@@ -36,6 +37,7 @@
36 37
37#include "alloc.h" 38#include "alloc.h"
38#include "aops.h" 39#include "aops.h"
40#include "blockcheck.h"
39#include "dlmglue.h" 41#include "dlmglue.h"
40#include "extent_map.h" 42#include "extent_map.h"
41#include "inode.h" 43#include "inode.h"
@@ -46,6 +48,7 @@
46#include "file.h" 48#include "file.h"
47#include "super.h" 49#include "super.h"
48#include "uptodate.h" 50#include "uptodate.h"
51#include "xattr.h"
49 52
50#include "buffer_head_io.h" 53#include "buffer_head_io.h"
51 54
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
187static int ocfs2_dinode_sanity_check(struct inode *inode, 190static int ocfs2_dinode_sanity_check(struct inode *inode,
188 struct ocfs2_extent_tree *et) 191 struct ocfs2_extent_tree *et)
189{ 192{
190 int ret = 0; 193 struct ocfs2_dinode *di = et->et_object;
191 struct ocfs2_dinode *di;
192 194
193 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 195 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
196 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
194 197
195 di = et->et_object; 198 return 0;
196 if (!OCFS2_IS_VALID_DINODE(di)) {
197 ret = -EIO;
198 ocfs2_error(inode->i_sb,
199 "Inode %llu has invalid path root",
200 (unsigned long long)OCFS2_I(inode)->ip_blkno);
201 }
202
203 return ret;
204} 199}
205 200
206static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) 201static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
213 208
214static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) 209static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
215{ 210{
216 struct ocfs2_xattr_value_root *xv = et->et_object; 211 struct ocfs2_xattr_value_buf *vb = et->et_object;
217 212
218 et->et_root_el = &xv->xr_list; 213 et->et_root_el = &vb->vb_xv->xr_list;
219} 214}
220 215
221static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, 216static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
222 u64 blkno) 217 u64 blkno)
223{ 218{
224 struct ocfs2_xattr_value_root *xv = 219 struct ocfs2_xattr_value_buf *vb = et->et_object;
225 (struct ocfs2_xattr_value_root *)et->et_object;
226 220
227 xv->xr_last_eb_blk = cpu_to_le64(blkno); 221 vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
228} 222}
229 223
230static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) 224static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
231{ 225{
232 struct ocfs2_xattr_value_root *xv = 226 struct ocfs2_xattr_value_buf *vb = et->et_object;
233 (struct ocfs2_xattr_value_root *) et->et_object;
234 227
235 return le64_to_cpu(xv->xr_last_eb_blk); 228 return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
236} 229}
237 230
238static void ocfs2_xattr_value_update_clusters(struct inode *inode, 231static void ocfs2_xattr_value_update_clusters(struct inode *inode,
239 struct ocfs2_extent_tree *et, 232 struct ocfs2_extent_tree *et,
240 u32 clusters) 233 u32 clusters)
241{ 234{
242 struct ocfs2_xattr_value_root *xv = 235 struct ocfs2_xattr_value_buf *vb = et->et_object;
243 (struct ocfs2_xattr_value_root *)et->et_object;
244 236
245 le32_add_cpu(&xv->xr_clusters, clusters); 237 le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
246} 238}
247 239
248static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { 240static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
304static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 296static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
305 struct inode *inode, 297 struct inode *inode,
306 struct buffer_head *bh, 298 struct buffer_head *bh,
299 ocfs2_journal_access_func access,
307 void *obj, 300 void *obj,
308 struct ocfs2_extent_tree_operations *ops) 301 struct ocfs2_extent_tree_operations *ops)
309{ 302{
310 et->et_ops = ops; 303 et->et_ops = ops;
311 et->et_root_bh = bh; 304 et->et_root_bh = bh;
305 et->et_root_journal_access = access;
312 if (!obj) 306 if (!obj)
313 obj = (void *)bh->b_data; 307 obj = (void *)bh->b_data;
314 et->et_object = obj; 308 et->et_object = obj;
@@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
324 struct inode *inode, 318 struct inode *inode,
325 struct buffer_head *bh) 319 struct buffer_head *bh)
326{ 320{
327 __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); 321 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
322 NULL, &ocfs2_dinode_et_ops);
328} 323}
329 324
330void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 325void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
331 struct inode *inode, 326 struct inode *inode,
332 struct buffer_head *bh) 327 struct buffer_head *bh)
333{ 328{
334 __ocfs2_init_extent_tree(et, inode, bh, NULL, 329 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
335 &ocfs2_xattr_tree_et_ops); 330 NULL, &ocfs2_xattr_tree_et_ops);
336} 331}
337 332
338void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 333void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 struct inode *inode, 334 struct inode *inode,
340 struct buffer_head *bh, 335 struct ocfs2_xattr_value_buf *vb)
341 struct ocfs2_xattr_value_root *xv)
342{ 336{
343 __ocfs2_init_extent_tree(et, inode, bh, xv, 337 __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
344 &ocfs2_xattr_value_et_ops); 338 &ocfs2_xattr_value_et_ops);
345} 339}
346 340
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
362 et->et_ops->eo_update_clusters(inode, et, clusters); 356 et->et_ops->eo_update_clusters(inode, et, clusters);
363} 357}
364 358
359static inline int ocfs2_et_root_journal_access(handle_t *handle,
360 struct inode *inode,
361 struct ocfs2_extent_tree *et,
362 int type)
363{
364 return et->et_root_journal_access(handle, inode, et->et_root_bh,
365 type);
366}
367
365static inline int ocfs2_et_insert_check(struct inode *inode, 368static inline int ocfs2_et_insert_check(struct inode *inode,
366 struct ocfs2_extent_tree *et, 369 struct ocfs2_extent_tree *et,
367 struct ocfs2_extent_rec *rec) 370 struct ocfs2_extent_rec *rec)
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
402#define OCFS2_MAX_PATH_DEPTH 5 405#define OCFS2_MAX_PATH_DEPTH 5
403 406
404struct ocfs2_path { 407struct ocfs2_path {
405 int p_tree_depth; 408 int p_tree_depth;
406 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 409 ocfs2_journal_access_func p_root_access;
410 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
407}; 411};
408 412
409#define path_root_bh(_path) ((_path)->p_node[0].bh) 413#define path_root_bh(_path) ((_path)->p_node[0].bh)
410#define path_root_el(_path) ((_path)->p_node[0].el) 414#define path_root_el(_path) ((_path)->p_node[0].el)
415#define path_root_access(_path)((_path)->p_root_access)
411#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 416#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
412#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 417#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
413#define path_num_items(_path) ((_path)->p_tree_depth + 1) 418#define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
440 */ 445 */
441 if (keep_root) 446 if (keep_root)
442 depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 447 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
448 else
449 path_root_access(path) = NULL;
443 450
444 path->p_tree_depth = depth; 451 path->p_tree_depth = depth;
445} 452}
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
465 472
466 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 473 BUG_ON(path_root_bh(dest) != path_root_bh(src));
467 BUG_ON(path_root_el(dest) != path_root_el(src)); 474 BUG_ON(path_root_el(dest) != path_root_el(src));
475 BUG_ON(path_root_access(dest) != path_root_access(src));
468 476
469 ocfs2_reinit_path(dest, 1); 477 ocfs2_reinit_path(dest, 1);
470 478
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
486 int i; 494 int i;
487 495
488 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 496 BUG_ON(path_root_bh(dest) != path_root_bh(src));
497 BUG_ON(path_root_access(dest) != path_root_access(src));
489 498
490 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 499 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
491 brelse(dest->p_node[i].bh); 500 brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
521} 530}
522 531
523static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 532static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
524 struct ocfs2_extent_list *root_el) 533 struct ocfs2_extent_list *root_el,
534 ocfs2_journal_access_func access)
525{ 535{
526 struct ocfs2_path *path; 536 struct ocfs2_path *path;
527 537
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
533 get_bh(root_bh); 543 get_bh(root_bh);
534 path_root_bh(path) = root_bh; 544 path_root_bh(path) = root_bh;
535 path_root_el(path) = root_el; 545 path_root_el(path) = root_el;
546 path_root_access(path) = access;
536 } 547 }
537 548
538 return path; 549 return path;
539} 550}
540 551
552static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
553{
554 return ocfs2_new_path(path_root_bh(path), path_root_el(path),
555 path_root_access(path));
556}
557
558static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
559{
560 return ocfs2_new_path(et->et_root_bh, et->et_root_el,
561 et->et_root_journal_access);
562}
563
564/*
565 * Journal the buffer at depth idx. All idx>0 are extent_blocks,
566 * otherwise it's the root_access function.
567 *
568 * I don't like the way this function's name looks next to
569 * ocfs2_journal_access_path(), but I don't have a better one.
570 */
571static int ocfs2_path_bh_journal_access(handle_t *handle,
572 struct inode *inode,
573 struct ocfs2_path *path,
574 int idx)
575{
576 ocfs2_journal_access_func access = path_root_access(path);
577
578 if (!access)
579 access = ocfs2_journal_access;
580
581 if (idx)
582 access = ocfs2_journal_access_eb;
583
584 return access(handle, inode, path->p_node[idx].bh,
585 OCFS2_JOURNAL_ACCESS_WRITE);
586}
587
541/* 588/*
542 * Convenience function to journal all components in a path. 589 * Convenience function to journal all components in a path.
543 */ 590 */
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
550 goto out; 597 goto out;
551 598
552 for(i = 0; i < path_num_items(path); i++) { 599 for(i = 0; i < path_num_items(path); i++) {
553 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 600 ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
554 OCFS2_JOURNAL_ACCESS_WRITE);
555 if (ret < 0) { 601 if (ret < 0) {
556 mlog_errno(ret); 602 mlog_errno(ret);
557 goto out; 603 goto out;
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
686 int c_split_covers_rec; 732 int c_split_covers_rec;
687}; 733};
688 734
735static int ocfs2_validate_extent_block(struct super_block *sb,
736 struct buffer_head *bh)
737{
738 int rc;
739 struct ocfs2_extent_block *eb =
740 (struct ocfs2_extent_block *)bh->b_data;
741
742 mlog(0, "Validating extent block %llu\n",
743 (unsigned long long)bh->b_blocknr);
744
745 BUG_ON(!buffer_uptodate(bh));
746
747 /*
748 * If the ecc fails, we return the error but otherwise
749 * leave the filesystem running. We know any error is
750 * local to this block.
751 */
752 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
753 if (rc) {
754 mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
755 (unsigned long long)bh->b_blocknr);
756 return rc;
757 }
758
759 /*
760 * Errors after here are fatal.
761 */
762
763 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
764 ocfs2_error(sb,
765 "Extent block #%llu has bad signature %.*s",
766 (unsigned long long)bh->b_blocknr, 7,
767 eb->h_signature);
768 return -EINVAL;
769 }
770
771 if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
772 ocfs2_error(sb,
773 "Extent block #%llu has an invalid h_blkno "
774 "of %llu",
775 (unsigned long long)bh->b_blocknr,
776 (unsigned long long)le64_to_cpu(eb->h_blkno));
777 return -EINVAL;
778 }
779
780 if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
781 ocfs2_error(sb,
782 "Extent block #%llu has an invalid "
783 "h_fs_generation of #%u",
784 (unsigned long long)bh->b_blocknr,
785 le32_to_cpu(eb->h_fs_generation));
786 return -EINVAL;
787 }
788
789 return 0;
790}
791
792int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
793 struct buffer_head **bh)
794{
795 int rc;
796 struct buffer_head *tmp = *bh;
797
798 rc = ocfs2_read_block(inode, eb_blkno, &tmp,
799 ocfs2_validate_extent_block);
800
801 /* If ocfs2_read_block() got us a new bh, pass it up. */
802 if (!rc && !*bh)
803 *bh = tmp;
804
805 return rc;
806}
807
808
689/* 809/*
690 * How many free extents have we got before we need more meta data? 810 * How many free extents have we got before we need more meta data?
691 */ 811 */
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
705 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 825 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
706 826
707 if (last_eb_blk) { 827 if (last_eb_blk) {
708 retval = ocfs2_read_block(inode, last_eb_blk, 828 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
709 &eb_bh);
710 if (retval < 0) { 829 if (retval < 0) {
711 mlog_errno(retval); 830 mlog_errno(retval);
712 goto bail; 831 goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
768 } 887 }
769 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 888 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
770 889
771 status = ocfs2_journal_access(handle, inode, bhs[i], 890 status = ocfs2_journal_access_eb(handle, inode, bhs[i],
772 OCFS2_JOURNAL_ACCESS_CREATE); 891 OCFS2_JOURNAL_ACCESS_CREATE);
773 if (status < 0) { 892 if (status < 0) {
774 mlog_errno(status); 893 mlog_errno(status);
775 goto bail; 894 goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
908 for(i = 0; i < new_blocks; i++) { 1027 for(i = 0; i < new_blocks; i++) {
909 bh = new_eb_bhs[i]; 1028 bh = new_eb_bhs[i];
910 eb = (struct ocfs2_extent_block *) bh->b_data; 1029 eb = (struct ocfs2_extent_block *) bh->b_data;
911 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1030 /* ocfs2_create_new_meta_bhs() should create it right! */
912 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1031 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
913 status = -EIO;
914 goto bail;
915 }
916 eb_el = &eb->h_list; 1032 eb_el = &eb->h_list;
917 1033
918 status = ocfs2_journal_access(handle, inode, bh, 1034 status = ocfs2_journal_access_eb(handle, inode, bh,
919 OCFS2_JOURNAL_ACCESS_CREATE); 1035 OCFS2_JOURNAL_ACCESS_CREATE);
920 if (status < 0) { 1036 if (status < 0) {
921 mlog_errno(status); 1037 mlog_errno(status);
922 goto bail; 1038 goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
955 * journal_dirty erroring as it won't unless we've aborted the 1071 * journal_dirty erroring as it won't unless we've aborted the
956 * handle (in which case we would never be here) so reserving 1072 * handle (in which case we would never be here) so reserving
957 * the write with journal_access is all we need to do. */ 1073 * the write with journal_access is all we need to do. */
958 status = ocfs2_journal_access(handle, inode, *last_eb_bh, 1074 status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
959 OCFS2_JOURNAL_ACCESS_WRITE); 1075 OCFS2_JOURNAL_ACCESS_WRITE);
960 if (status < 0) { 1076 if (status < 0) {
961 mlog_errno(status); 1077 mlog_errno(status);
962 goto bail; 1078 goto bail;
963 } 1079 }
964 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1080 status = ocfs2_et_root_journal_access(handle, inode, et,
965 OCFS2_JOURNAL_ACCESS_WRITE); 1081 OCFS2_JOURNAL_ACCESS_WRITE);
966 if (status < 0) { 1082 if (status < 0) {
967 mlog_errno(status); 1083 mlog_errno(status);
968 goto bail; 1084 goto bail;
969 } 1085 }
970 if (eb_bh) { 1086 if (eb_bh) {
971 status = ocfs2_journal_access(handle, inode, eb_bh, 1087 status = ocfs2_journal_access_eb(handle, inode, eb_bh,
972 OCFS2_JOURNAL_ACCESS_WRITE); 1088 OCFS2_JOURNAL_ACCESS_WRITE);
973 if (status < 0) { 1089 if (status < 0) {
974 mlog_errno(status); 1090 mlog_errno(status);
975 goto bail; 1091 goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1052 } 1168 }
1053 1169
1054 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 1170 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1055 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1171 /* ocfs2_create_new_meta_bhs() should create it right! */
1056 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1172 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1057 status = -EIO;
1058 goto bail;
1059 }
1060 1173
1061 eb_el = &eb->h_list; 1174 eb_el = &eb->h_list;
1062 root_el = et->et_root_el; 1175 root_el = et->et_root_el;
1063 1176
1064 status = ocfs2_journal_access(handle, inode, new_eb_bh, 1177 status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
1065 OCFS2_JOURNAL_ACCESS_CREATE); 1178 OCFS2_JOURNAL_ACCESS_CREATE);
1066 if (status < 0) { 1179 if (status < 0) {
1067 mlog_errno(status); 1180 mlog_errno(status);
1068 goto bail; 1181 goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1080 goto bail; 1193 goto bail;
1081 } 1194 }
1082 1195
1083 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1196 status = ocfs2_et_root_journal_access(handle, inode, et,
1084 OCFS2_JOURNAL_ACCESS_WRITE); 1197 OCFS2_JOURNAL_ACCESS_WRITE);
1085 if (status < 0) { 1198 if (status < 0) {
1086 mlog_errno(status); 1199 mlog_errno(status);
1087 goto bail; 1200 goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1176 brelse(bh); 1289 brelse(bh);
1177 bh = NULL; 1290 bh = NULL;
1178 1291
1179 status = ocfs2_read_block(inode, blkno, &bh); 1292 status = ocfs2_read_extent_block(inode, blkno, &bh);
1180 if (status < 0) { 1293 if (status < 0) {
1181 mlog_errno(status); 1294 mlog_errno(status);
1182 goto bail; 1295 goto bail;
1183 } 1296 }
1184 1297
1185 eb = (struct ocfs2_extent_block *) bh->b_data; 1298 eb = (struct ocfs2_extent_block *) bh->b_data;
1186 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1187 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1188 status = -EIO;
1189 goto bail;
1190 }
1191 el = &eb->h_list; 1299 el = &eb->h_list;
1192 1300
1193 if (le16_to_cpu(el->l_next_free_rec) < 1301 if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
1540 1648
1541 brelse(bh); 1649 brelse(bh);
1542 bh = NULL; 1650 bh = NULL;
1543 ret = ocfs2_read_block(inode, blkno, &bh); 1651 ret = ocfs2_read_extent_block(inode, blkno, &bh);
1544 if (ret) { 1652 if (ret) {
1545 mlog_errno(ret); 1653 mlog_errno(ret);
1546 goto out; 1654 goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
1548 1656
1549 eb = (struct ocfs2_extent_block *) bh->b_data; 1657 eb = (struct ocfs2_extent_block *) bh->b_data;
1550 el = &eb->h_list; 1658 el = &eb->h_list;
1551 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1552 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1553 ret = -EIO;
1554 goto out;
1555 }
1556 1659
1557 if (le16_to_cpu(el->l_next_free_rec) > 1660 if (le16_to_cpu(el->l_next_free_rec) >
1558 le16_to_cpu(el->l_count)) { 1661 le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
1860 root_bh = left_path->p_node[subtree_index].bh; 1963 root_bh = left_path->p_node[subtree_index].bh;
1861 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1964 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1862 1965
1863 ret = ocfs2_journal_access(handle, inode, root_bh, 1966 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
1864 OCFS2_JOURNAL_ACCESS_WRITE); 1967 subtree_index);
1865 if (ret) { 1968 if (ret) {
1866 mlog_errno(ret); 1969 mlog_errno(ret);
1867 goto out; 1970 goto out;
1868 } 1971 }
1869 1972
1870 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1973 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1871 ret = ocfs2_journal_access(handle, inode, 1974 ret = ocfs2_path_bh_journal_access(handle, inode,
1872 right_path->p_node[i].bh, 1975 right_path, i);
1873 OCFS2_JOURNAL_ACCESS_WRITE);
1874 if (ret) { 1976 if (ret) {
1875 mlog_errno(ret); 1977 mlog_errno(ret);
1876 goto out; 1978 goto out;
1877 } 1979 }
1878 1980
1879 ret = ocfs2_journal_access(handle, inode, 1981 ret = ocfs2_path_bh_journal_access(handle, inode,
1880 left_path->p_node[i].bh, 1982 left_path, i);
1881 OCFS2_JOURNAL_ACCESS_WRITE);
1882 if (ret) { 1983 if (ret) {
1883 mlog_errno(ret); 1984 mlog_errno(ret);
1884 goto out; 1985 goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2102 2203
2103 *ret_left_path = NULL; 2204 *ret_left_path = NULL;
2104 2205
2105 left_path = ocfs2_new_path(path_root_bh(right_path), 2206 left_path = ocfs2_new_path_from_path(right_path);
2106 path_root_el(right_path));
2107 if (!left_path) { 2207 if (!left_path) {
2108 ret = -ENOMEM; 2208 ret = -ENOMEM;
2109 mlog_errno(ret); 2209 mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2398 return -EAGAIN; 2498 return -EAGAIN;
2399 2499
2400 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2500 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2401 ret = ocfs2_journal_access(handle, inode, 2501 ret = ocfs2_journal_access_eb(handle, inode,
2402 path_leaf_bh(right_path), 2502 path_leaf_bh(right_path),
2403 OCFS2_JOURNAL_ACCESS_WRITE); 2503 OCFS2_JOURNAL_ACCESS_WRITE);
2404 if (ret) { 2504 if (ret) {
2405 mlog_errno(ret); 2505 mlog_errno(ret);
2406 goto out; 2506 goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2417 * We have to update i_last_eb_blk during the meta 2517 * We have to update i_last_eb_blk during the meta
2418 * data delete. 2518 * data delete.
2419 */ 2519 */
2420 ret = ocfs2_journal_access(handle, inode, et_root_bh, 2520 ret = ocfs2_et_root_journal_access(handle, inode, et,
2421 OCFS2_JOURNAL_ACCESS_WRITE); 2521 OCFS2_JOURNAL_ACCESS_WRITE);
2422 if (ret) { 2522 if (ret) {
2423 mlog_errno(ret); 2523 mlog_errno(ret);
2424 goto out; 2524 goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2433 */ 2533 */
2434 BUG_ON(right_has_empty && !del_right_subtree); 2534 BUG_ON(right_has_empty && !del_right_subtree);
2435 2535
2436 ret = ocfs2_journal_access(handle, inode, root_bh, 2536 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
2437 OCFS2_JOURNAL_ACCESS_WRITE); 2537 subtree_index);
2438 if (ret) { 2538 if (ret) {
2439 mlog_errno(ret); 2539 mlog_errno(ret);
2440 goto out; 2540 goto out;
2441 } 2541 }
2442 2542
2443 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2543 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2444 ret = ocfs2_journal_access(handle, inode, 2544 ret = ocfs2_path_bh_journal_access(handle, inode,
2445 right_path->p_node[i].bh, 2545 right_path, i);
2446 OCFS2_JOURNAL_ACCESS_WRITE);
2447 if (ret) { 2546 if (ret) {
2448 mlog_errno(ret); 2547 mlog_errno(ret);
2449 goto out; 2548 goto out;
2450 } 2549 }
2451 2550
2452 ret = ocfs2_journal_access(handle, inode, 2551 ret = ocfs2_path_bh_journal_access(handle, inode,
2453 left_path->p_node[i].bh, 2552 left_path, i);
2454 OCFS2_JOURNAL_ACCESS_WRITE);
2455 if (ret) { 2553 if (ret) {
2456 mlog_errno(ret); 2554 mlog_errno(ret);
2457 goto out; 2555 goto out;
@@ -2596,16 +2694,17 @@ out:
2596 2694
2597static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, 2695static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2598 handle_t *handle, 2696 handle_t *handle,
2599 struct buffer_head *bh, 2697 struct ocfs2_path *path)
2600 struct ocfs2_extent_list *el)
2601{ 2698{
2602 int ret; 2699 int ret;
2700 struct buffer_head *bh = path_leaf_bh(path);
2701 struct ocfs2_extent_list *el = path_leaf_el(path);
2603 2702
2604 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2703 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2605 return 0; 2704 return 0;
2606 2705
2607 ret = ocfs2_journal_access(handle, inode, bh, 2706 ret = ocfs2_path_bh_journal_access(handle, inode, path,
2608 OCFS2_JOURNAL_ACCESS_WRITE); 2707 path_num_items(path) - 1);
2609 if (ret) { 2708 if (ret) {
2610 mlog_errno(ret); 2709 mlog_errno(ret);
2611 goto out; 2710 goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2644 goto out; 2743 goto out;
2645 } 2744 }
2646 2745
2647 left_path = ocfs2_new_path(path_root_bh(path), 2746 left_path = ocfs2_new_path_from_path(path);
2648 path_root_el(path));
2649 if (!left_path) { 2747 if (!left_path) {
2650 ret = -ENOMEM; 2748 ret = -ENOMEM;
2651 mlog_errno(ret); 2749 mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2654 2752
2655 ocfs2_cp_path(left_path, path); 2753 ocfs2_cp_path(left_path, path);
2656 2754
2657 right_path = ocfs2_new_path(path_root_bh(path), 2755 right_path = ocfs2_new_path_from_path(path);
2658 path_root_el(path));
2659 if (!right_path) { 2756 if (!right_path) {
2660 ret = -ENOMEM; 2757 ret = -ENOMEM;
2661 mlog_errno(ret); 2758 mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2689 * Caller might still want to make changes to the 2786 * Caller might still want to make changes to the
2690 * tree root, so re-add it to the journal here. 2787 * tree root, so re-add it to the journal here.
2691 */ 2788 */
2692 ret = ocfs2_journal_access(handle, inode, 2789 ret = ocfs2_path_bh_journal_access(handle, inode,
2693 path_root_bh(left_path), 2790 left_path, 0);
2694 OCFS2_JOURNAL_ACCESS_WRITE);
2695 if (ret) { 2791 if (ret) {
2696 mlog_errno(ret); 2792 mlog_errno(ret);
2697 goto out; 2793 goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2785 * We have a path to the left of this one - it needs 2881 * We have a path to the left of this one - it needs
2786 * an update too. 2882 * an update too.
2787 */ 2883 */
2788 left_path = ocfs2_new_path(path_root_bh(path), 2884 left_path = ocfs2_new_path_from_path(path);
2789 path_root_el(path));
2790 if (!left_path) { 2885 if (!left_path) {
2791 ret = -ENOMEM; 2886 ret = -ENOMEM;
2792 mlog_errno(ret); 2887 mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
2875 * it up front. 2970 * it up front.
2876 */ 2971 */
2877 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2972 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2878 path_leaf_bh(path), 2973 path);
2879 path_leaf_el(path));
2880 if (ret) 2974 if (ret)
2881 mlog_errno(ret); 2975 mlog_errno(ret);
2882 goto out; 2976 goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
3027 /* This function shouldn't be called for the rightmost leaf. */ 3121 /* This function shouldn't be called for the rightmost leaf. */
3028 BUG_ON(right_cpos == 0); 3122 BUG_ON(right_cpos == 0);
3029 3123
3030 right_path = ocfs2_new_path(path_root_bh(left_path), 3124 right_path = ocfs2_new_path_from_path(left_path);
3031 path_root_el(left_path));
3032 if (!right_path) { 3125 if (!right_path) {
3033 ret = -ENOMEM; 3126 ret = -ENOMEM;
3034 mlog_errno(ret); 3127 mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3111 root_bh = left_path->p_node[subtree_index].bh; 3204 root_bh = left_path->p_node[subtree_index].bh;
3112 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3205 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3113 3206
3114 ret = ocfs2_journal_access(handle, inode, root_bh, 3207 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3115 OCFS2_JOURNAL_ACCESS_WRITE); 3208 subtree_index);
3116 if (ret) { 3209 if (ret) {
3117 mlog_errno(ret); 3210 mlog_errno(ret);
3118 goto out; 3211 goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3120 3213
3121 for (i = subtree_index + 1; 3214 for (i = subtree_index + 1;
3122 i < path_num_items(right_path); i++) { 3215 i < path_num_items(right_path); i++) {
3123 ret = ocfs2_journal_access(handle, inode, 3216 ret = ocfs2_path_bh_journal_access(handle, inode,
3124 right_path->p_node[i].bh, 3217 right_path, i);
3125 OCFS2_JOURNAL_ACCESS_WRITE);
3126 if (ret) { 3218 if (ret) {
3127 mlog_errno(ret); 3219 mlog_errno(ret);
3128 goto out; 3220 goto out;
3129 } 3221 }
3130 3222
3131 ret = ocfs2_journal_access(handle, inode, 3223 ret = ocfs2_path_bh_journal_access(handle, inode,
3132 left_path->p_node[i].bh, 3224 left_path, i);
3133 OCFS2_JOURNAL_ACCESS_WRITE);
3134 if (ret) { 3225 if (ret) {
3135 mlog_errno(ret); 3226 mlog_errno(ret);
3136 goto out; 3227 goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3142 right_rec = &el->l_recs[index + 1]; 3233 right_rec = &el->l_recs[index + 1];
3143 } 3234 }
3144 3235
3145 ret = ocfs2_journal_access(handle, inode, bh, 3236 ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
3146 OCFS2_JOURNAL_ACCESS_WRITE); 3237 path_num_items(left_path) - 1);
3147 if (ret) { 3238 if (ret) {
3148 mlog_errno(ret); 3239 mlog_errno(ret);
3149 goto out; 3240 goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
3199 /* This function shouldn't be called for the leftmost leaf. */ 3290 /* This function shouldn't be called for the leftmost leaf. */
3200 BUG_ON(left_cpos == 0); 3291 BUG_ON(left_cpos == 0);
3201 3292
3202 left_path = ocfs2_new_path(path_root_bh(right_path), 3293 left_path = ocfs2_new_path_from_path(right_path);
3203 path_root_el(right_path));
3204 if (!left_path) { 3294 if (!left_path) {
3205 ret = -ENOMEM; 3295 ret = -ENOMEM;
3206 mlog_errno(ret); 3296 mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3283 root_bh = left_path->p_node[subtree_index].bh; 3373 root_bh = left_path->p_node[subtree_index].bh;
3284 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3374 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3285 3375
3286 ret = ocfs2_journal_access(handle, inode, root_bh, 3376 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3287 OCFS2_JOURNAL_ACCESS_WRITE); 3377 subtree_index);
3288 if (ret) { 3378 if (ret) {
3289 mlog_errno(ret); 3379 mlog_errno(ret);
3290 goto out; 3380 goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3292 3382
3293 for (i = subtree_index + 1; 3383 for (i = subtree_index + 1;
3294 i < path_num_items(right_path); i++) { 3384 i < path_num_items(right_path); i++) {
3295 ret = ocfs2_journal_access(handle, inode, 3385 ret = ocfs2_path_bh_journal_access(handle, inode,
3296 right_path->p_node[i].bh, 3386 right_path, i);
3297 OCFS2_JOURNAL_ACCESS_WRITE);
3298 if (ret) { 3387 if (ret) {
3299 mlog_errno(ret); 3388 mlog_errno(ret);
3300 goto out; 3389 goto out;
3301 } 3390 }
3302 3391
3303 ret = ocfs2_journal_access(handle, inode, 3392 ret = ocfs2_path_bh_journal_access(handle, inode,
3304 left_path->p_node[i].bh, 3393 left_path, i);
3305 OCFS2_JOURNAL_ACCESS_WRITE);
3306 if (ret) { 3394 if (ret) {
3307 mlog_errno(ret); 3395 mlog_errno(ret);
3308 goto out; 3396 goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3314 has_empty_extent = 1; 3402 has_empty_extent = 1;
3315 } 3403 }
3316 3404
3317 ret = ocfs2_journal_access(handle, inode, bh, 3405 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3318 OCFS2_JOURNAL_ACCESS_WRITE); 3406 path_num_items(right_path) - 1);
3319 if (ret) { 3407 if (ret) {
3320 mlog_errno(ret); 3408 mlog_errno(ret);
3321 goto out; 3409 goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3732 * leftmost leaf. 3820 * leftmost leaf.
3733 */ 3821 */
3734 if (left_cpos) { 3822 if (left_cpos) {
3735 left_path = ocfs2_new_path(path_root_bh(right_path), 3823 left_path = ocfs2_new_path_from_path(right_path);
3736 path_root_el(right_path));
3737 if (!left_path) { 3824 if (!left_path) {
3738 ret = -ENOMEM; 3825 ret = -ENOMEM;
3739 mlog_errno(ret); 3826 mlog_errno(ret);
@@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
3781 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; 3868 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3782 struct ocfs2_extent_rec *rec, *tmprec; 3869 struct ocfs2_extent_rec *rec, *tmprec;
3783 3870
3784 right_el = path_leaf_el(right_path);; 3871 right_el = path_leaf_el(right_path);
3785 if (left_path) 3872 if (left_path)
3786 left_el = path_leaf_el(left_path); 3873 left_el = path_leaf_el(left_path);
3787 3874
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3958 4045
3959 el = et->et_root_el; 4046 el = et->et_root_el;
3960 4047
3961 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4048 ret = ocfs2_et_root_journal_access(handle, inode, et,
3962 OCFS2_JOURNAL_ACCESS_WRITE); 4049 OCFS2_JOURNAL_ACCESS_WRITE);
3963 if (ret) { 4050 if (ret) {
3964 mlog_errno(ret); 4051 mlog_errno(ret);
3965 goto out; 4052 goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3970 goto out_update_clusters; 4057 goto out_update_clusters;
3971 } 4058 }
3972 4059
3973 right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4060 right_path = ocfs2_new_path_from_et(et);
3974 if (!right_path) { 4061 if (!right_path) {
3975 ret = -ENOMEM; 4062 ret = -ENOMEM;
3976 mlog_errno(ret); 4063 mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4020 * ocfs2_rotate_tree_right() might have extended the 4107 * ocfs2_rotate_tree_right() might have extended the
4021 * transaction without re-journaling our tree root. 4108 * transaction without re-journaling our tree root.
4022 */ 4109 */
4023 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4110 ret = ocfs2_et_root_journal_access(handle, inode, et,
4024 OCFS2_JOURNAL_ACCESS_WRITE); 4111 OCFS2_JOURNAL_ACCESS_WRITE);
4025 if (ret) { 4112 if (ret) {
4026 mlog_errno(ret); 4113 mlog_errno(ret);
4027 goto out; 4114 goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4082 goto out; 4169 goto out;
4083 4170
4084 if (left_cpos != 0) { 4171 if (left_cpos != 0) {
4085 left_path = ocfs2_new_path(path_root_bh(path), 4172 left_path = ocfs2_new_path_from_path(path);
4086 path_root_el(path));
4087 if (!left_path) 4173 if (!left_path)
4088 goto out; 4174 goto out;
4089 4175
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4097 le16_to_cpu(new_el->l_count)) { 4183 le16_to_cpu(new_el->l_count)) {
4098 bh = path_leaf_bh(left_path); 4184 bh = path_leaf_bh(left_path);
4099 eb = (struct ocfs2_extent_block *)bh->b_data; 4185 eb = (struct ocfs2_extent_block *)bh->b_data;
4100 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4186 ocfs2_error(inode->i_sb,
4101 eb); 4187 "Extent block #%llu has an "
4188 "invalid l_next_free_rec of "
4189 "%d. It should have "
4190 "matched the l_count of %d",
4191 (unsigned long long)le64_to_cpu(eb->h_blkno),
4192 le16_to_cpu(new_el->l_next_free_rec),
4193 le16_to_cpu(new_el->l_count));
4194 status = -EINVAL;
4102 goto out; 4195 goto out;
4103 } 4196 }
4104 rec = &new_el->l_recs[ 4197 rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4132 if (right_cpos == 0) 4225 if (right_cpos == 0)
4133 goto out; 4226 goto out;
4134 4227
4135 right_path = ocfs2_new_path(path_root_bh(path), 4228 right_path = ocfs2_new_path_from_path(path);
4136 path_root_el(path));
4137 if (!right_path) 4229 if (!right_path)
4138 goto out; 4230 goto out;
4139 4231
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4147 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4239 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4148 bh = path_leaf_bh(right_path); 4240 bh = path_leaf_bh(right_path);
4149 eb = (struct ocfs2_extent_block *)bh->b_data; 4241 eb = (struct ocfs2_extent_block *)bh->b_data;
4150 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4242 ocfs2_error(inode->i_sb,
4151 eb); 4243 "Extent block #%llu has an "
4244 "invalid l_next_free_rec of %d",
4245 (unsigned long long)le64_to_cpu(eb->h_blkno),
4246 le16_to_cpu(new_el->l_next_free_rec));
4247 status = -EINVAL;
4152 goto out; 4248 goto out;
4153 } 4249 }
4154 rec = &new_el->l_recs[1]; 4250 rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4294 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4390 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4295 * may want it later. 4391 * may want it later.
4296 */ 4392 */
4297 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); 4393 ret = ocfs2_read_extent_block(inode,
4394 ocfs2_et_get_last_eb_blk(et),
4395 &bh);
4298 if (ret) { 4396 if (ret) {
4299 mlog_exit(ret); 4397 mlog_exit(ret);
4300 goto out; 4398 goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4320 return 0; 4418 return 0;
4321 } 4419 }
4322 4420
4323 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4421 path = ocfs2_new_path_from_et(et);
4324 if (!path) { 4422 if (!path) {
4325 ret = -ENOMEM; 4423 ret = -ENOMEM;
4326 mlog_errno(ret); 4424 mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4531 4629
4532 BUG_ON(num_bits > clusters_to_add); 4630 BUG_ON(num_bits > clusters_to_add);
4533 4631
4534 /* reserve our write early -- insert_extent may update the inode */ 4632 /* reserve our write early -- insert_extent may update the tree root */
4535 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 4633 status = ocfs2_et_root_journal_access(handle, inode, et,
4536 OCFS2_JOURNAL_ACCESS_WRITE); 4634 OCFS2_JOURNAL_ACCESS_WRITE);
4537 if (status < 0) { 4635 if (status < 0) {
4538 mlog_errno(status); 4636 mlog_errno(status);
4539 goto leave; 4637 goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4760 if (path->p_tree_depth) { 4858 if (path->p_tree_depth) {
4761 struct ocfs2_extent_block *eb; 4859 struct ocfs2_extent_block *eb;
4762 4860
4763 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 4861 ret = ocfs2_read_extent_block(inode,
4764 &last_eb_bh); 4862 ocfs2_et_get_last_eb_blk(et),
4863 &last_eb_bh);
4765 if (ret) { 4864 if (ret) {
4766 mlog_exit(ret); 4865 mlog_exit(ret);
4767 goto out; 4866 goto out;
4768 } 4867 }
4769 4868
4770 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4869 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4771 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4772 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4773 ret = -EROFS;
4774 goto out;
4775 }
4776
4777 rightmost_el = &eb->h_list; 4870 rightmost_el = &eb->h_list;
4778 } else 4871 } else
4779 rightmost_el = path_root_el(path); 4872 rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
4854 if (et->et_ops == &ocfs2_dinode_et_ops) 4947 if (et->et_ops == &ocfs2_dinode_et_ops)
4855 ocfs2_extent_map_trunc(inode, 0); 4948 ocfs2_extent_map_trunc(inode, 0);
4856 4949
4857 left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4950 left_path = ocfs2_new_path_from_et(et);
4858 if (!left_path) { 4951 if (!left_path) {
4859 ret = -ENOMEM; 4952 ret = -ENOMEM;
4860 mlog_errno(ret); 4953 mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4918 5011
4919 depth = path->p_tree_depth; 5012 depth = path->p_tree_depth;
4920 if (depth > 0) { 5013 if (depth > 0) {
4921 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 5014 ret = ocfs2_read_extent_block(inode,
4922 &last_eb_bh); 5015 ocfs2_et_get_last_eb_blk(et),
5016 &last_eb_bh);
4923 if (ret < 0) { 5017 if (ret < 0) {
4924 mlog_errno(ret); 5018 mlog_errno(ret);
4925 goto out; 5019 goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5025 } 5119 }
5026 5120
5027 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { 5121 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5028 left_path = ocfs2_new_path(path_root_bh(path), 5122 left_path = ocfs2_new_path_from_path(path);
5029 path_root_el(path));
5030 if (!left_path) { 5123 if (!left_path) {
5031 ret = -ENOMEM; 5124 ret = -ENOMEM;
5032 mlog_errno(ret); 5125 mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
5135 5228
5136 ocfs2_extent_map_trunc(inode, 0); 5229 ocfs2_extent_map_trunc(inode, 0);
5137 5230
5138 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 5231 path = ocfs2_new_path_from_et(et);
5139 if (!path) { 5232 if (!path) {
5140 ret = -ENOMEM; 5233 ret = -ENOMEM;
5141 mlog_errno(ret); 5234 mlog_errno(ret);
@@ -5255,6 +5348,81 @@ out:
5255 return ret; 5348 return ret;
5256} 5349}
5257 5350
5351int ocfs2_remove_btree_range(struct inode *inode,
5352 struct ocfs2_extent_tree *et,
5353 u32 cpos, u32 phys_cpos, u32 len,
5354 struct ocfs2_cached_dealloc_ctxt *dealloc)
5355{
5356 int ret;
5357 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5358 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5359 struct inode *tl_inode = osb->osb_tl_inode;
5360 handle_t *handle;
5361 struct ocfs2_alloc_context *meta_ac = NULL;
5362
5363 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5364 if (ret) {
5365 mlog_errno(ret);
5366 return ret;
5367 }
5368
5369 mutex_lock(&tl_inode->i_mutex);
5370
5371 if (ocfs2_truncate_log_needs_flush(osb)) {
5372 ret = __ocfs2_flush_truncate_log(osb);
5373 if (ret < 0) {
5374 mlog_errno(ret);
5375 goto out;
5376 }
5377 }
5378
5379 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5380 if (IS_ERR(handle)) {
5381 ret = PTR_ERR(handle);
5382 mlog_errno(ret);
5383 goto out;
5384 }
5385
5386 ret = ocfs2_et_root_journal_access(handle, inode, et,
5387 OCFS2_JOURNAL_ACCESS_WRITE);
5388 if (ret) {
5389 mlog_errno(ret);
5390 goto out;
5391 }
5392
5393 vfs_dq_free_space_nodirty(inode,
5394 ocfs2_clusters_to_bytes(inode->i_sb, len));
5395
5396 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5397 dealloc);
5398 if (ret) {
5399 mlog_errno(ret);
5400 goto out_commit;
5401 }
5402
5403 ocfs2_et_update_clusters(inode, et, -len);
5404
5405 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5406 if (ret) {
5407 mlog_errno(ret);
5408 goto out_commit;
5409 }
5410
5411 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5412 if (ret)
5413 mlog_errno(ret);
5414
5415out_commit:
5416 ocfs2_commit_trans(osb, handle);
5417out:
5418 mutex_unlock(&tl_inode->i_mutex);
5419
5420 if (meta_ac)
5421 ocfs2_free_alloc_context(meta_ac);
5422
5423 return ret;
5424}
5425
5258int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 5426int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5259{ 5427{
5260 struct buffer_head *tl_bh = osb->osb_tl_bh; 5428 struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -5308,13 +5476,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5308 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5476 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5309 5477
5310 di = (struct ocfs2_dinode *) tl_bh->b_data; 5478 di = (struct ocfs2_dinode *) tl_bh->b_data;
5311 tl = &di->id2.i_dealloc;
5312 if (!OCFS2_IS_VALID_DINODE(di)) {
5313 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5314 status = -EIO;
5315 goto bail;
5316 }
5317 5479
5480 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
5481 * by the underlying call to ocfs2_read_inode_block(), so any
5482 * corruption is a code bug */
5483 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5484
5485 tl = &di->id2.i_dealloc;
5318 tl_count = le16_to_cpu(tl->tl_count); 5486 tl_count = le16_to_cpu(tl->tl_count);
5319 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 5487 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5320 tl_count == 0, 5488 tl_count == 0,
@@ -5332,8 +5500,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5332 goto bail; 5500 goto bail;
5333 } 5501 }
5334 5502
5335 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5503 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5336 OCFS2_JOURNAL_ACCESS_WRITE); 5504 OCFS2_JOURNAL_ACCESS_WRITE);
5337 if (status < 0) { 5505 if (status < 0) {
5338 mlog_errno(status); 5506 mlog_errno(status);
5339 goto bail; 5507 goto bail;
@@ -5394,8 +5562,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5394 while (i >= 0) { 5562 while (i >= 0) {
5395 /* Caller has given us at least enough credits to 5563 /* Caller has given us at least enough credits to
5396 * update the truncate log dinode */ 5564 * update the truncate log dinode */
5397 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5565 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5398 OCFS2_JOURNAL_ACCESS_WRITE); 5566 OCFS2_JOURNAL_ACCESS_WRITE);
5399 if (status < 0) { 5567 if (status < 0) {
5400 mlog_errno(status); 5568 mlog_errno(status);
5401 goto bail; 5569 goto bail;
@@ -5464,13 +5632,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5464 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5632 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5465 5633
5466 di = (struct ocfs2_dinode *) tl_bh->b_data; 5634 di = (struct ocfs2_dinode *) tl_bh->b_data;
5467 tl = &di->id2.i_dealloc;
5468 if (!OCFS2_IS_VALID_DINODE(di)) {
5469 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5470 status = -EIO;
5471 goto out;
5472 }
5473 5635
5636 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
5637 * by the underlying call to ocfs2_read_inode_block(), so any
5638 * corruption is a code bug */
5639 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5640
5641 tl = &di->id2.i_dealloc;
5474 num_to_flush = le16_to_cpu(tl->tl_used); 5642 num_to_flush = le16_to_cpu(tl->tl_used);
5475 mlog(0, "Flush %u records from truncate log #%llu\n", 5643 mlog(0, "Flush %u records from truncate log #%llu\n",
5476 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5644 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5754,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5586 goto bail; 5754 goto bail;
5587 } 5755 }
5588 5756
5589 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 5757 status = ocfs2_read_inode_block(inode, &bh);
5590 if (status < 0) { 5758 if (status < 0) {
5591 iput(inode); 5759 iput(inode);
5592 mlog_errno(status); 5760 mlog_errno(status);
@@ -5625,13 +5793,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5625 } 5793 }
5626 5794
5627 di = (struct ocfs2_dinode *) tl_bh->b_data; 5795 di = (struct ocfs2_dinode *) tl_bh->b_data;
5628 tl = &di->id2.i_dealloc;
5629 if (!OCFS2_IS_VALID_DINODE(di)) {
5630 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5631 status = -EIO;
5632 goto bail;
5633 }
5634 5796
5797 /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's
5798 * validated by the underlying call to ocfs2_read_inode_block(),
5799 * so any corruption is a code bug */
5800 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5801
5802 tl = &di->id2.i_dealloc;
5635 if (le16_to_cpu(tl->tl_used)) { 5803 if (le16_to_cpu(tl->tl_used)) {
5636 mlog(0, "We'll have %u logs to recover\n", 5804 mlog(0, "We'll have %u logs to recover\n",
5637 le16_to_cpu(tl->tl_used)); 5805 le16_to_cpu(tl->tl_used));
@@ -5651,6 +5819,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5651 * tl_used. */ 5819 * tl_used. */
5652 tl->tl_used = 0; 5820 tl->tl_used = 0;
5653 5821
5822 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
5654 status = ocfs2_write_block(osb, tl_bh, tl_inode); 5823 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5655 if (status < 0) { 5824 if (status < 0) {
5656 mlog_errno(status); 5825 mlog_errno(status);
@@ -5800,7 +5969,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5800 */ 5969 */
5801 5970
5802/* 5971/*
5803 * Describes a single block free from a suballocator 5972 * Describe a single bit freed from a suballocator. For the block
5973 * suballocators, it represents one block. For the global cluster
5974 * allocator, it represents a run of clusters and free_bit holds
5975 * the number of clusters.
5804 */ 5976 */
5805struct ocfs2_cached_block_free { 5977struct ocfs2_cached_block_free {
5806 struct ocfs2_cached_block_free *free_next; 5978 struct ocfs2_cached_block_free *free_next;
@@ -5815,10 +5987,10 @@ struct ocfs2_per_slot_free_list {
5815 struct ocfs2_cached_block_free *f_first; 5987 struct ocfs2_cached_block_free *f_first;
5816}; 5988};
5817 5989
5818static int ocfs2_free_cached_items(struct ocfs2_super *osb, 5990static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
5819 int sysfile_type, 5991 int sysfile_type,
5820 int slot, 5992 int slot,
5821 struct ocfs2_cached_block_free *head) 5993 struct ocfs2_cached_block_free *head)
5822{ 5994{
5823 int ret; 5995 int ret;
5824 u64 bg_blkno; 5996 u64 bg_blkno;
@@ -5893,6 +6065,82 @@ out:
5893 return ret; 6065 return ret;
5894} 6066}
5895 6067
6068int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6069 u64 blkno, unsigned int bit)
6070{
6071 int ret = 0;
6072 struct ocfs2_cached_block_free *item;
6073
6074 item = kmalloc(sizeof(*item), GFP_NOFS);
6075 if (item == NULL) {
6076 ret = -ENOMEM;
6077 mlog_errno(ret);
6078 return ret;
6079 }
6080
6081 mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6082 bit, (unsigned long long)blkno);
6083
6084 item->free_blk = blkno;
6085 item->free_bit = bit;
6086 item->free_next = ctxt->c_global_allocator;
6087
6088 ctxt->c_global_allocator = item;
6089 return ret;
6090}
6091
6092static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6093 struct ocfs2_cached_block_free *head)
6094{
6095 struct ocfs2_cached_block_free *tmp;
6096 struct inode *tl_inode = osb->osb_tl_inode;
6097 handle_t *handle;
6098 int ret = 0;
6099
6100 mutex_lock(&tl_inode->i_mutex);
6101
6102 while (head) {
6103 if (ocfs2_truncate_log_needs_flush(osb)) {
6104 ret = __ocfs2_flush_truncate_log(osb);
6105 if (ret < 0) {
6106 mlog_errno(ret);
6107 break;
6108 }
6109 }
6110
6111 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6112 if (IS_ERR(handle)) {
6113 ret = PTR_ERR(handle);
6114 mlog_errno(ret);
6115 break;
6116 }
6117
6118 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6119 head->free_bit);
6120
6121 ocfs2_commit_trans(osb, handle);
6122 tmp = head;
6123 head = head->free_next;
6124 kfree(tmp);
6125
6126 if (ret < 0) {
6127 mlog_errno(ret);
6128 break;
6129 }
6130 }
6131
6132 mutex_unlock(&tl_inode->i_mutex);
6133
6134 while (head) {
6135 /* Premature exit may have left some dangling items. */
6136 tmp = head;
6137 head = head->free_next;
6138 kfree(tmp);
6139 }
6140
6141 return ret;
6142}
6143
5896int ocfs2_run_deallocs(struct ocfs2_super *osb, 6144int ocfs2_run_deallocs(struct ocfs2_super *osb,
5897 struct ocfs2_cached_dealloc_ctxt *ctxt) 6145 struct ocfs2_cached_dealloc_ctxt *ctxt)
5898{ 6146{
@@ -5908,8 +6156,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
5908 if (fl->f_first) { 6156 if (fl->f_first) {
5909 mlog(0, "Free items: (type %u, slot %d)\n", 6157 mlog(0, "Free items: (type %u, slot %d)\n",
5910 fl->f_inode_type, fl->f_slot); 6158 fl->f_inode_type, fl->f_slot);
5911 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, 6159 ret2 = ocfs2_free_cached_blocks(osb,
5912 fl->f_slot, fl->f_first); 6160 fl->f_inode_type,
6161 fl->f_slot,
6162 fl->f_first);
5913 if (ret2) 6163 if (ret2)
5914 mlog_errno(ret2); 6164 mlog_errno(ret2);
5915 if (!ret) 6165 if (!ret)
@@ -5920,6 +6170,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
5920 kfree(fl); 6170 kfree(fl);
5921 } 6171 }
5922 6172
6173 if (ctxt->c_global_allocator) {
6174 ret2 = ocfs2_free_cached_clusters(osb,
6175 ctxt->c_global_allocator);
6176 if (ret2)
6177 mlog_errno(ret2);
6178 if (!ret)
6179 ret = ret2;
6180
6181 ctxt->c_global_allocator = NULL;
6182 }
6183
5923 return ret; 6184 return ret;
5924} 6185}
5925 6186
@@ -6075,11 +6336,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6075 6336
6076 eb = (struct ocfs2_extent_block *) bh->b_data; 6337 eb = (struct ocfs2_extent_block *) bh->b_data;
6077 el = &eb->h_list; 6338 el = &eb->h_list;
6078 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 6339
6079 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 6340 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6080 ret = -EROFS; 6341 * Any corruption is a code bug. */
6081 goto out; 6342 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6082 }
6083 6343
6084 *new_last_eb = bh; 6344 *new_last_eb = bh;
6085 get_bh(*new_last_eb); 6345 get_bh(*new_last_eb);
@@ -6326,8 +6586,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6326 } 6586 }
6327 6587
6328 if (last_eb_bh) { 6588 if (last_eb_bh) {
6329 status = ocfs2_journal_access(handle, inode, last_eb_bh, 6589 status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
6330 OCFS2_JOURNAL_ACCESS_WRITE); 6590 OCFS2_JOURNAL_ACCESS_WRITE);
6331 if (status < 0) { 6591 if (status < 0) {
6332 mlog_errno(status); 6592 mlog_errno(status);
6333 goto bail; 6593 goto bail;
@@ -6350,6 +6610,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6350 goto bail; 6610 goto bail;
6351 } 6611 }
6352 6612
6613 vfs_dq_free_space_nodirty(inode,
6614 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6353 spin_lock(&OCFS2_I(inode)->ip_lock); 6615 spin_lock(&OCFS2_I(inode)->ip_lock);
6354 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6616 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6355 clusters_to_del; 6617 clusters_to_del;
@@ -6436,11 +6698,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6436 mlog_errno(ret); 6698 mlog_errno(ret);
6437 else if (ocfs2_should_order_data(inode)) { 6699 else if (ocfs2_should_order_data(inode)) {
6438 ret = ocfs2_jbd2_file_inode(handle, inode); 6700 ret = ocfs2_jbd2_file_inode(handle, inode);
6439#ifdef CONFIG_OCFS2_COMPAT_JBD
6440 ret = walk_page_buffers(handle, page_buffers(page),
6441 from, to, &partial,
6442 ocfs2_journal_dirty_data);
6443#endif
6444 if (ret < 0) 6701 if (ret < 0)
6445 mlog_errno(ret); 6702 mlog_errno(ret);
6446 } 6703 }
@@ -6663,6 +6920,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6663 struct page **pages = NULL; 6920 struct page **pages = NULL;
6664 loff_t end = osb->s_clustersize; 6921 loff_t end = osb->s_clustersize;
6665 struct ocfs2_extent_tree et; 6922 struct ocfs2_extent_tree et;
6923 int did_quota = 0;
6666 6924
6667 has_data = i_size_read(inode) ? 1 : 0; 6925 has_data = i_size_read(inode) ? 1 : 0;
6668 6926
@@ -6682,15 +6940,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6682 } 6940 }
6683 } 6941 }
6684 6942
6685 handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); 6943 handle = ocfs2_start_trans(osb,
6944 ocfs2_inline_to_extents_credits(osb->sb));
6686 if (IS_ERR(handle)) { 6945 if (IS_ERR(handle)) {
6687 ret = PTR_ERR(handle); 6946 ret = PTR_ERR(handle);
6688 mlog_errno(ret); 6947 mlog_errno(ret);
6689 goto out_unlock; 6948 goto out_unlock;
6690 } 6949 }
6691 6950
6692 ret = ocfs2_journal_access(handle, inode, di_bh, 6951 ret = ocfs2_journal_access_di(handle, inode, di_bh,
6693 OCFS2_JOURNAL_ACCESS_WRITE); 6952 OCFS2_JOURNAL_ACCESS_WRITE);
6694 if (ret) { 6953 if (ret) {
6695 mlog_errno(ret); 6954 mlog_errno(ret);
6696 goto out_commit; 6955 goto out_commit;
@@ -6701,6 +6960,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6701 unsigned int page_end; 6960 unsigned int page_end;
6702 u64 phys; 6961 u64 phys;
6703 6962
6963 if (vfs_dq_alloc_space_nodirty(inode,
6964 ocfs2_clusters_to_bytes(osb->sb, 1))) {
6965 ret = -EDQUOT;
6966 goto out_commit;
6967 }
6968 did_quota = 1;
6969
6704 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6970 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6705 &num); 6971 &num);
6706 if (ret) { 6972 if (ret) {
@@ -6774,6 +7040,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6774 } 7040 }
6775 7041
6776out_commit: 7042out_commit:
7043 if (ret < 0 && did_quota)
7044 vfs_dq_free_space_nodirty(inode,
7045 ocfs2_clusters_to_bytes(osb->sb, 1));
7046
6777 ocfs2_commit_trans(osb, handle); 7047 ocfs2_commit_trans(osb, handle);
6778 7048
6779out_unlock: 7049out_unlock:
@@ -6813,7 +7083,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
6813 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7083 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6814 i_size_read(inode)); 7084 i_size_read(inode));
6815 7085
6816 path = ocfs2_new_path(fe_bh, &di->id2.i_list); 7086 path = ocfs2_new_path(fe_bh, &di->id2.i_list,
7087 ocfs2_journal_access_di);
6817 if (!path) { 7088 if (!path) {
6818 status = -ENOMEM; 7089 status = -ENOMEM;
6819 mlog_errno(status); 7090 mlog_errno(status);
@@ -6984,20 +7255,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6984 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 7255 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6985 7256
6986 if (fe->id2.i_list.l_tree_depth) { 7257 if (fe->id2.i_list.l_tree_depth) {
6987 status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), 7258 status = ocfs2_read_extent_block(inode,
6988 &last_eb_bh); 7259 le64_to_cpu(fe->i_last_eb_blk),
7260 &last_eb_bh);
6989 if (status < 0) { 7261 if (status < 0) {
6990 mlog_errno(status); 7262 mlog_errno(status);
6991 goto bail; 7263 goto bail;
6992 } 7264 }
6993 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 7265 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6994 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6995 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6996
6997 brelse(last_eb_bh);
6998 status = -EIO;
6999 goto bail;
7000 }
7001 } 7266 }
7002 7267
7003 (*tc)->tc_last_eb_bh = last_eb_bh; 7268 (*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7317,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7052 goto out; 7317 goto out;
7053 } 7318 }
7054 7319
7055 ret = ocfs2_journal_access(handle, inode, di_bh, 7320 ret = ocfs2_journal_access_di(handle, inode, di_bh,
7056 OCFS2_JOURNAL_ACCESS_WRITE); 7321 OCFS2_JOURNAL_ACCESS_WRITE);
7057 if (ret) { 7322 if (ret) {
7058 mlog_errno(ret); 7323 mlog_errno(ret);
7059 goto out_commit; 7324 goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfbe..cceff5c37f47 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
45 * 45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a 46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree 47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
48 * functions. 48 * functions. With metadata ecc, we now call different journal_access
49 * functions for each type of metadata, so it must have the
50 * root_journal_access function.
49 * ocfs2_extent_tree_operations abstract the normal operations we do for 51 * ocfs2_extent_tree_operations abstract the normal operations we do for
50 * the root of extent b-tree. 52 * the root of extent b-tree.
51 */ 53 */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
54 struct ocfs2_extent_tree_operations *et_ops; 56 struct ocfs2_extent_tree_operations *et_ops;
55 struct buffer_head *et_root_bh; 57 struct buffer_head *et_root_bh;
56 struct ocfs2_extent_list *et_root_el; 58 struct ocfs2_extent_list *et_root_el;
59 ocfs2_journal_access_func et_root_journal_access;
57 void *et_object; 60 void *et_object;
58 unsigned int et_max_leaf_clusters; 61 unsigned int et_max_leaf_clusters;
59}; 62};
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
68void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 71void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode, 72 struct inode *inode,
70 struct buffer_head *bh); 73 struct buffer_head *bh);
74struct ocfs2_xattr_value_buf;
71void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode, 76 struct inode *inode,
73 struct buffer_head *bh, 77 struct ocfs2_xattr_value_buf *vb);
74 struct ocfs2_xattr_value_root *xv); 78
79/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be
81 * allocated. This is a cached read. The extent block will be validated
82 * with ocfs2_validate_extent_block().
83 */
84int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
85 struct buffer_head **bh);
75 86
76struct ocfs2_alloc_context; 87struct ocfs2_alloc_context;
77int ocfs2_insert_extent(struct ocfs2_super *osb, 88int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
110 u32 cpos, u32 len, handle_t *handle, 121 u32 cpos, u32 len, handle_t *handle,
111 struct ocfs2_alloc_context *meta_ac, 122 struct ocfs2_alloc_context *meta_ac,
112 struct ocfs2_cached_dealloc_ctxt *dealloc); 123 struct ocfs2_cached_dealloc_ctxt *dealloc);
124int ocfs2_remove_btree_range(struct inode *inode,
125 struct ocfs2_extent_tree *et,
126 u32 cpos, u32 phys_cpos, u32 len,
127 struct ocfs2_cached_dealloc_ctxt *dealloc);
128
113int ocfs2_num_free_extents(struct ocfs2_super *osb, 129int ocfs2_num_free_extents(struct ocfs2_super *osb,
114 struct inode *inode, 130 struct inode *inode,
115 struct ocfs2_extent_tree *et); 131 struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
167 */ 183 */
168struct ocfs2_cached_dealloc_ctxt { 184struct ocfs2_cached_dealloc_ctxt {
169 struct ocfs2_per_slot_free_list *c_first_suballocator; 185 struct ocfs2_per_slot_free_list *c_first_suballocator;
186 struct ocfs2_cached_block_free *c_global_allocator;
170}; 187};
171static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) 188static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
172{ 189{
173 c->c_first_suballocator = NULL; 190 c->c_first_suballocator = NULL;
191 c->c_global_allocator = NULL;
192}
193int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
194 u64 blkno, unsigned int bit);
195static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
196{
197 return c->c_global_allocator != NULL;
174} 198}
175int ocfs2_run_deallocs(struct ocfs2_super *osb, 199int ocfs2_run_deallocs(struct ocfs2_super *osb,
176 struct ocfs2_cached_dealloc_ctxt *ctxt); 200 struct ocfs2_cached_dealloc_ctxt *ctxt);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b33420..a067a6cffb01 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
27#include <linux/swap.h> 27#include <linux/swap.h>
28#include <linux/pipe_fs_i.h> 28#include <linux/pipe_fs_i.h>
29#include <linux/mpage.h> 29#include <linux/mpage.h>
30#include <linux/quotaops.h>
30 31
31#define MLOG_MASK_PREFIX ML_FILE_IO 32#define MLOG_MASK_PREFIX ML_FILE_IO
32#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
68 goto bail; 69 goto bail;
69 } 70 }
70 71
71 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 72 status = ocfs2_read_inode_block(inode, &bh);
72 if (status < 0) { 73 if (status < 0) {
73 mlog_errno(status); 74 mlog_errno(status);
74 goto bail; 75 goto bail;
75 } 76 }
76 fe = (struct ocfs2_dinode *) bh->b_data; 77 fe = (struct ocfs2_dinode *) bh->b_data;
77 78
78 if (!OCFS2_IS_VALID_DINODE(fe)) {
79 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
80 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
81 fe->i_signature);
82 goto bail;
83 }
84
85 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, 79 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
86 le32_to_cpu(fe->i_clusters))) { 80 le32_to_cpu(fe->i_clusters))) {
87 mlog(ML_ERROR, "block offset is outside the allocated size: " 81 mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
262 BUG_ON(!PageLocked(page)); 256 BUG_ON(!PageLocked(page));
263 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); 257 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
264 258
265 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); 259 ret = ocfs2_read_inode_block(inode, &di_bh);
266 if (ret) { 260 if (ret) {
267 mlog_errno(ret); 261 mlog_errno(ret);
268 goto out; 262 goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
481 475
482 if (ocfs2_should_order_data(inode)) { 476 if (ocfs2_should_order_data(inode)) {
483 ret = ocfs2_jbd2_file_inode(handle, inode); 477 ret = ocfs2_jbd2_file_inode(handle, inode);
484#ifdef CONFIG_OCFS2_COMPAT_JBD
485 ret = walk_page_buffers(handle,
486 page_buffers(page),
487 from, to, NULL,
488 ocfs2_journal_dirty_data);
489#endif
490 if (ret < 0) 478 if (ret < 0)
491 mlog_errno(ret); 479 mlog_errno(ret);
492 } 480 }
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
1072 tmppage = wc->w_pages[i]; 1060 tmppage = wc->w_pages[i];
1073 1061
1074 if (page_has_buffers(tmppage)) { 1062 if (page_has_buffers(tmppage)) {
1075 if (ocfs2_should_order_data(inode)) { 1063 if (ocfs2_should_order_data(inode))
1076 ocfs2_jbd2_file_inode(wc->w_handle, inode); 1064 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1077#ifdef CONFIG_OCFS2_COMPAT_JBD
1078 walk_page_buffers(wc->w_handle,
1079 page_buffers(tmppage),
1080 from, to, NULL,
1081 ocfs2_journal_dirty_data);
1082#endif
1083 }
1084 1065
1085 block_commit_write(tmppage, from, to); 1066 block_commit_write(tmppage, from, to);
1086 } 1067 }
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
1531 goto out; 1512 goto out;
1532 } 1513 }
1533 1514
1534 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, 1515 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
1535 OCFS2_JOURNAL_ACCESS_WRITE); 1516 OCFS2_JOURNAL_ACCESS_WRITE);
1536 if (ret) { 1517 if (ret) {
1537 ocfs2_commit_trans(osb, handle); 1518 ocfs2_commit_trans(osb, handle);
1538 1519
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1750 1731
1751 wc->w_handle = handle; 1732 wc->w_handle = handle;
1752 1733
1734 if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
1735 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
1736 ret = -EDQUOT;
1737 goto out_commit;
1738 }
1753 /* 1739 /*
1754 * We don't want this to fail in ocfs2_write_end(), so do it 1740 * We don't want this to fail in ocfs2_write_end(), so do it
1755 * here. 1741 * here.
1756 */ 1742 */
1757 ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, 1743 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
1758 OCFS2_JOURNAL_ACCESS_WRITE); 1744 OCFS2_JOURNAL_ACCESS_WRITE);
1759 if (ret) { 1745 if (ret) {
1760 mlog_errno(ret); 1746 mlog_errno(ret);
1761 goto out_commit; 1747 goto out_quota;
1762 } 1748 }
1763 1749
1764 /* 1750 /*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1771 mmap_page); 1757 mmap_page);
1772 if (ret) { 1758 if (ret) {
1773 mlog_errno(ret); 1759 mlog_errno(ret);
1774 goto out_commit; 1760 goto out_quota;
1775 } 1761 }
1776 1762
1777 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, 1763 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1778 len); 1764 len);
1779 if (ret) { 1765 if (ret) {
1780 mlog_errno(ret); 1766 mlog_errno(ret);
1781 goto out_commit; 1767 goto out_quota;
1782 } 1768 }
1783 1769
1784 if (data_ac) 1770 if (data_ac)
@@ -1790,6 +1776,10 @@ success:
1790 *pagep = wc->w_target_page; 1776 *pagep = wc->w_target_page;
1791 *fsdata = wc; 1777 *fsdata = wc;
1792 return 0; 1778 return 0;
1779out_quota:
1780 if (clusters_to_alloc)
1781 vfs_dq_free_space(inode,
1782 ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
1793out_commit: 1783out_commit:
1794 ocfs2_commit_trans(osb, handle); 1784 ocfs2_commit_trans(osb, handle);
1795 1785
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
1919 } 1909 }
1920 1910
1921 if (page_has_buffers(tmppage)) { 1911 if (page_has_buffers(tmppage)) {
1922 if (ocfs2_should_order_data(inode)) { 1912 if (ocfs2_should_order_data(inode))
1923 ocfs2_jbd2_file_inode(wc->w_handle, inode); 1913 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1924#ifdef CONFIG_OCFS2_COMPAT_JBD
1925 walk_page_buffers(wc->w_handle,
1926 page_buffers(tmppage),
1927 from, to, NULL,
1928 ocfs2_journal_dirty_data);
1929#endif
1930 }
1931 block_commit_write(tmppage, from, to); 1914 block_commit_write(tmppage, from, to);
1932 } 1915 }
1933 } 1916 }
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 000000000000..2a947c44e594
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,477 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * blockcheck.c
5 *
6 * Checksum and ECC codes for the OCFS2 userspace library.
7 *
8 * Copyright (C) 2006, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/kernel.h>
21#include <linux/types.h>
22#include <linux/crc32.h>
23#include <linux/buffer_head.h>
24#include <linux/bitops.h>
25#include <asm/byteorder.h>
26
27#include <cluster/masklog.h>
28
29#include "ocfs2.h"
30
31#include "blockcheck.h"
32
33
34/*
35 * We use the following conventions:
36 *
37 * d = # data bits
38 * p = # parity bits
39 * c = # total code bits (d + p)
40 */
41
42
43/*
44 * Calculate the bit offset in the hamming code buffer based on the bit's
45 * offset in the data buffer. Since the hamming code reserves all
46 * power-of-two bits for parity, the data bit number and the code bit
47 * number are offset by all the parity bits beforehand.
48 *
49 * Recall that bit numbers in hamming code are 1-based. This function
50 * takes the 0-based data bit from the caller.
51 *
52 * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0),
53 * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit.
54 * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3
55 * in the code buffer.
56 *
57 * The caller can pass in p_cache if it wants to keep track of the most recent
58 * number of parity bits added. This allows the function to start the
59 * calculation at the last place.
60 */
61static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
62{
63 unsigned int b, p = 0;
64
65 /*
66 * Data bits are 0-based, but we're talking code bits, which
67 * are 1-based.
68 */
69 b = i + 1;
70
71 /* Use the cache if it is there */
72 if (p_cache)
73 p = *p_cache;
74 b += p;
75
76 /*
77 * For every power of two below our bit number, bump our bit.
78 *
79 * We compare with (b + 1) because we have to compare with what b
80 * would be _if_ it were bumped up by the parity bit. Capice?
81 *
82 * p is set above.
83 */
84 for (; (1 << p) < (b + 1); p++)
85 b++;
86
87 if (p_cache)
88 *p_cache = p;
89
90 return b;
91}
92
93/*
94 * This is the low level encoder function. It can be called across
95 * multiple hunks just like the crc32 code. 'd' is the number of bits
96 * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
97 * two 512B buffers, you would do it like so:
98 *
99 * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
100 * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
101 *
102 * If you just have one buffer, use ocfs2_hamming_encode_block().
103 */
104u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
105{
106 unsigned int i, b, p = 0;
107
108 BUG_ON(!d);
109
110 /*
111 * b is the hamming code bit number. Hamming code specifies a
112 * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is
113 * for the algorithm.
114 *
115 * The i++ in the for loop is so that the start offset passed
116 * to ocfs2_find_next_bit() is one greater than the previously
117 * found bit.
118 */
119 for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
120 {
121 /*
122 * i is the offset in this hunk, nr + i is the total bit
123 * offset.
124 */
125 b = calc_code_bit(nr + i, &p);
126
127 /*
128 * Data bits in the resultant code are checked by
129 * parity bits that are part of the bit number
130 * representation. Huh?
131 *
132 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
133 * In other words, the parity bit at position 2^k
134 * checks bits in positions having bit k set in
135 * their binary representation. Conversely, for
136 * instance, bit 13, i.e. 1101(2), is checked by
137 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
138 * </wikipedia>
139 *
140 * Note that 'k' is the _code_ bit number. 'b' in
141 * our loop.
142 */
143 parity ^= b;
144 }
145
146 /* While the data buffer was treated as little endian, the
147 * return value is in host endian. */
148 return parity;
149}
150
151u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
152{
153 return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
154}
155
156/*
157 * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
158 * offset of the current hunk. If bit to be fixed is not part of the
159 * current hunk, this does nothing.
160 *
161 * If you only have one hunk, use ocfs2_hamming_fix_block().
162 */
163void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
164 unsigned int fix)
165{
166 unsigned int i, b;
167
168 BUG_ON(!d);
169
170 /*
171 * If the bit to fix has an hweight of 1, it's a parity bit. One
172 * busted parity bit is its own error. Nothing to do here.
173 */
174 if (hweight32(fix) == 1)
175 return;
176
177 /*
178 * nr + d is the bit right past the data hunk we're looking at.
179 * If fix is after that, nothing to do
180 */
181 if (fix >= calc_code_bit(nr + d, NULL))
182 return;
183
184 /*
185 * nr is the offset in the data hunk we're starting at. Let's
186 * start b at the offset in the code buffer. See hamming_encode()
187 * for a more detailed description of 'b'.
188 */
189 b = calc_code_bit(nr, NULL);
190 /* If the fix is before this hunk, nothing to do */
191 if (fix < b)
192 return;
193
194 for (i = 0; i < d; i++, b++)
195 {
196 /* Skip past parity bits */
197 while (hweight32(b) == 1)
198 b++;
199
200 /*
201 * i is the offset in this data hunk.
202 * nr + i is the offset in the total data buffer.
203 * b is the offset in the total code buffer.
204 *
205 * Thus, when b == fix, bit i in the current hunk needs
206 * fixing.
207 */
208 if (b == fix)
209 {
210 if (ocfs2_test_bit(i, data))
211 ocfs2_clear_bit(i, data);
212 else
213 ocfs2_set_bit(i, data);
214 break;
215 }
216 }
217}
218
219void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
220 unsigned int fix)
221{
222 ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
223}
224
225/*
226 * This function generates check information for a block.
227 * data is the block to be checked. bc is a pointer to the
228 * ocfs2_block_check structure describing the crc32 and the ecc.
229 *
230 * bc should be a pointer inside data, as the function will
231 * take care of zeroing it before calculating the check information. If
232 * bc does not point inside data, the caller must make sure any inline
233 * ocfs2_block_check structures are zeroed.
234 *
235 * The data buffer must be in on-disk endian (little endian for ocfs2).
236 * bc will be filled with little-endian values and will be ready to go to
237 * disk.
238 */
239void ocfs2_block_check_compute(void *data, size_t blocksize,
240 struct ocfs2_block_check *bc)
241{
242 u32 crc;
243 u32 ecc;
244
245 memset(bc, 0, sizeof(struct ocfs2_block_check));
246
247 crc = crc32_le(~0, data, blocksize);
248 ecc = ocfs2_hamming_encode_block(data, blocksize);
249
250 /*
251 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
252 * larger than 16 bits.
253 */
254 BUG_ON(ecc > USHORT_MAX);
255
256 bc->bc_crc32e = cpu_to_le32(crc);
257 bc->bc_ecc = cpu_to_le16((u16)ecc);
258}
259
260/*
261 * This function validates existing check information. Like _compute,
262 * the function will take care of zeroing bc before calculating check codes.
263 * If bc is not a pointer inside data, the caller must have zeroed any
264 * inline ocfs2_block_check structures.
265 *
266 * Again, the data passed in should be the on-disk endian.
267 */
268int ocfs2_block_check_validate(void *data, size_t blocksize,
269 struct ocfs2_block_check *bc)
270{
271 int rc = 0;
272 struct ocfs2_block_check check;
273 u32 crc, ecc;
274
275 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
276 check.bc_ecc = le16_to_cpu(bc->bc_ecc);
277
278 memset(bc, 0, sizeof(struct ocfs2_block_check));
279
280 /* Fast path - if the crc32 validates, we're good to go */
281 crc = crc32_le(~0, data, blocksize);
282 if (crc == check.bc_crc32e)
283 goto out;
284
285 mlog(ML_ERROR,
286 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
287 (unsigned int)check.bc_crc32e, (unsigned int)crc);
288
289 /* Ok, try ECC fixups */
290 ecc = ocfs2_hamming_encode_block(data, blocksize);
291 ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
292
293 /* And check the crc32 again */
294 crc = crc32_le(~0, data, blocksize);
295 if (crc == check.bc_crc32e)
296 goto out;
297
298 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
299 (unsigned int)check.bc_crc32e, (unsigned int)crc);
300
301 rc = -EIO;
302
303out:
304 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
305 bc->bc_ecc = cpu_to_le16(check.bc_ecc);
306
307 return rc;
308}
309
310/*
311 * This function generates check information for a list of buffer_heads.
312 * bhs is the blocks to be checked. bc is a pointer to the
313 * ocfs2_block_check structure describing the crc32 and the ecc.
314 *
315 * bc should be a pointer inside data, as the function will
316 * take care of zeroing it before calculating the check information. If
317 * bc does not point inside data, the caller must make sure any inline
318 * ocfs2_block_check structures are zeroed.
319 *
320 * The data buffer must be in on-disk endian (little endian for ocfs2).
321 * bc will be filled with little-endian values and will be ready to go to
322 * disk.
323 */
324void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
325 struct ocfs2_block_check *bc)
326{
327 int i;
328 u32 crc, ecc;
329
330 BUG_ON(nr < 0);
331
332 if (!nr)
333 return;
334
335 memset(bc, 0, sizeof(struct ocfs2_block_check));
336
337 for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
338 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
339 /*
340 * The number of bits in a buffer is obviously b_size*8.
341 * The offset of this buffer is b_size*i, so the bit offset
342 * of this buffer is b_size*8*i.
343 */
344 ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
345 bhs[i]->b_size * 8,
346 bhs[i]->b_size * 8 * i);
347 }
348
349 /*
350 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
351 * larger than 16 bits.
352 */
353 BUG_ON(ecc > USHORT_MAX);
354
355 bc->bc_crc32e = cpu_to_le32(crc);
356 bc->bc_ecc = cpu_to_le16((u16)ecc);
357}
358
359/*
360 * This function validates existing check information on a list of
361 * buffer_heads. Like _compute_bhs, the function will take care of
362 * zeroing bc before calculating check codes. If bc is not a pointer
363 * inside data, the caller must have zeroed any inline
364 * ocfs2_block_check structures.
365 *
366 * Again, the data passed in should be the on-disk endian.
367 */
368int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
369 struct ocfs2_block_check *bc)
370{
371 int i, rc = 0;
372 struct ocfs2_block_check check;
373 u32 crc, ecc, fix;
374
375 BUG_ON(nr < 0);
376
377 if (!nr)
378 return 0;
379
380 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
381 check.bc_ecc = le16_to_cpu(bc->bc_ecc);
382
383 memset(bc, 0, sizeof(struct ocfs2_block_check));
384
385 /* Fast path - if the crc32 validates, we're good to go */
386 for (i = 0, crc = ~0; i < nr; i++)
387 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
388 if (crc == check.bc_crc32e)
389 goto out;
390
391 mlog(ML_ERROR,
392 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
393 (unsigned int)check.bc_crc32e, (unsigned int)crc);
394
395 /* Ok, try ECC fixups */
396 for (i = 0, ecc = 0; i < nr; i++) {
397 /*
398 * The number of bits in a buffer is obviously b_size*8.
399 * The offset of this buffer is b_size*i, so the bit offset
400 * of this buffer is b_size*8*i.
401 */
402 ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
403 bhs[i]->b_size * 8,
404 bhs[i]->b_size * 8 * i);
405 }
406 fix = ecc ^ check.bc_ecc;
407 for (i = 0; i < nr; i++) {
408 /*
409 * Try the fix against each buffer. It will only affect
410 * one of them.
411 */
412 ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
413 bhs[i]->b_size * 8 * i, fix);
414 }
415
416 /* And check the crc32 again */
417 for (i = 0, crc = ~0; i < nr; i++)
418 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
419 if (crc == check.bc_crc32e)
420 goto out;
421
422 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
423 (unsigned int)check.bc_crc32e, (unsigned int)crc);
424
425 rc = -EIO;
426
427out:
428 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
429 bc->bc_ecc = cpu_to_le16(check.bc_ecc);
430
431 return rc;
432}
433
434/*
435 * These are the main API. They check the superblock flag before
436 * calling the underlying operations.
437 *
438 * They expect the buffer(s) to be in disk format.
439 */
440void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
441 struct ocfs2_block_check *bc)
442{
443 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
444 ocfs2_block_check_compute(data, sb->s_blocksize, bc);
445}
446
447int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
448 struct ocfs2_block_check *bc)
449{
450 int rc = 0;
451
452 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
453 rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
454
455 return rc;
456}
457
458void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
459 struct buffer_head **bhs, int nr,
460 struct ocfs2_block_check *bc)
461{
462 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
463 ocfs2_block_check_compute_bhs(bhs, nr, bc);
464}
465
466int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
467 struct buffer_head **bhs, int nr,
468 struct ocfs2_block_check *bc)
469{
470 int rc = 0;
471
472 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
473 rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
474
475 return rc;
476}
477
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 000000000000..70ec3feda32f
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * blockcheck.h
5 *
6 * Checksum and ECC codes for the OCFS2 userspace library.
7 *
8 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_BLOCKCHECK_H
21#define OCFS2_BLOCKCHECK_H
22
23
24/* High level block API */
25void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
26 struct ocfs2_block_check *bc);
27int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
28 struct ocfs2_block_check *bc);
29void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
30 struct buffer_head **bhs, int nr,
31 struct ocfs2_block_check *bc);
32int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
33 struct buffer_head **bhs, int nr,
34 struct ocfs2_block_check *bc);
35
36/* Lower level API */
37void ocfs2_block_check_compute(void *data, size_t blocksize,
38 struct ocfs2_block_check *bc);
39int ocfs2_block_check_validate(void *data, size_t blocksize,
40 struct ocfs2_block_check *bc);
41void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
42 struct ocfs2_block_check *bc);
43int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
44 struct ocfs2_block_check *bc);
45
46/*
47 * Hamming code functions
48 */
49
50/*
51 * Encoding hamming code parity bits for a buffer.
52 *
53 * This is the low level encoder function. It can be called across
54 * multiple hunks just like the crc32 code. 'd' is the number of bits
55 * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
56 * two 512B buffers, you would do it like so:
57 *
58 * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
59 * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
60 *
61 * If you just have one buffer, use ocfs2_hamming_encode_block().
62 */
63u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
64 unsigned int nr);
65/*
66 * Fix a buffer with a bit error. The 'fix' is the original parity
67 * xor'd with the parity calculated now.
68 *
69 * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
70 * offset of the current hunk. If bit to be fixed is not part of the
71 * current hunk, this does nothing.
72 *
73 * If you only have one buffer, use ocfs2_hamming_fix_block().
74 */
75void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
76 unsigned int fix);
77
78/* Convenience wrappers for a single buffer of data */
79extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
80extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
81 unsigned int fix);
82#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7c..15c8e6deee2e 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,18 @@
39 39
40#include "buffer_head_io.h" 40#include "buffer_head_io.h"
41 41
42/*
43 * Bits on bh->b_state used by ocfs2.
44 *
45 * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart.
46 */
47enum ocfs2_state_bits {
48 BH_NeedsValidate = BH_JBDPrivateStart,
49};
50
51/* Expand the magic b_state functions */
52BUFFER_FNS(NeedsValidate, needs_validate);
53
42int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, 54int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
43 struct inode *inode) 55 struct inode *inode)
44{ 56{
@@ -166,7 +178,9 @@ bail:
166} 178}
167 179
168int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, 180int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
169 struct buffer_head *bhs[], int flags) 181 struct buffer_head *bhs[], int flags,
182 int (*validate)(struct super_block *sb,
183 struct buffer_head *bh))
170{ 184{
171 int status = 0; 185 int status = 0;
172 int i, ignore_cache = 0; 186 int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
298 312
299 clear_buffer_uptodate(bh); 313 clear_buffer_uptodate(bh);
300 get_bh(bh); /* for end_buffer_read_sync() */ 314 get_bh(bh); /* for end_buffer_read_sync() */
315 if (validate)
316 set_buffer_needs_validate(bh);
301 bh->b_end_io = end_buffer_read_sync; 317 bh->b_end_io = end_buffer_read_sync;
302 submit_bh(READ, bh); 318 submit_bh(READ, bh);
303 continue; 319 continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
328 bhs[i] = NULL; 344 bhs[i] = NULL;
329 continue; 345 continue;
330 } 346 }
347
348 if (buffer_needs_validate(bh)) {
349 /* We never set NeedsValidate if the
350 * buffer was held by the journal, so
351 * that better not have changed */
352 BUG_ON(buffer_jbd(bh));
353 clear_buffer_needs_validate(bh);
354 status = validate(inode->i_sb, bh);
355 if (status) {
356 put_bh(bh);
357 bhs[i] = NULL;
358 continue;
359 }
360 }
331 } 361 }
332 362
333 /* Always set the buffer in the cache, even if it was 363 /* Always set the buffer in the cache, even if it was
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade7..c75d682dadd8 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
31void ocfs2_end_buffer_io_sync(struct buffer_head *bh, 31void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
32 int uptodate); 32 int uptodate);
33 33
34static inline int ocfs2_read_block(struct inode *inode,
35 u64 off,
36 struct buffer_head **bh);
37
38int ocfs2_write_block(struct ocfs2_super *osb, 34int ocfs2_write_block(struct ocfs2_super *osb,
39 struct buffer_head *bh, 35 struct buffer_head *bh,
40 struct inode *inode); 36 struct inode *inode);
41int ocfs2_read_blocks(struct inode *inode,
42 u64 block,
43 int nr,
44 struct buffer_head *bhs[],
45 int flags);
46int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, 37int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
47 unsigned int nr, struct buffer_head *bhs[]); 38 unsigned int nr, struct buffer_head *bhs[]);
48 39
40/*
41 * If not NULL, validate() will be called on a buffer that is freshly
42 * read from disk. It will not be called if the buffer was in cache.
43 * Note that if validate() is being used for this buffer, it needs to
44 * be set even for a READAHEAD call, as it marks the buffer for later
45 * validation.
46 */
47int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
48 struct buffer_head *bhs[], int flags,
49 int (*validate)(struct super_block *sb,
50 struct buffer_head *bh));
51
49int ocfs2_write_super_or_backup(struct ocfs2_super *osb, 52int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
50 struct buffer_head *bh); 53 struct buffer_head *bh);
51 54
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
53#define OCFS2_BH_READAHEAD 8 56#define OCFS2_BH_READAHEAD 8
54 57
55static inline int ocfs2_read_block(struct inode *inode, u64 off, 58static inline int ocfs2_read_block(struct inode *inode, u64 off,
56 struct buffer_head **bh) 59 struct buffer_head **bh,
60 int (*validate)(struct super_block *sb,
61 struct buffer_head *bh))
57{ 62{
58 int status = 0; 63 int status = 0;
59 64
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
63 goto bail; 68 goto bail;
64 } 69 }
65 70
66 status = ocfs2_read_blocks(inode, off, 1, bh, 0); 71 status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
67 72
68bail: 73bail:
69 return status; 74 return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6ebaa58e2c03..04697ba7f73e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data)
854 854
855 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 855 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
856 /* We track the time spent inside 856 /* We track the time spent inside
857 * o2hb_do_disk_heartbeat so that we avoid more then 857 * o2hb_do_disk_heartbeat so that we avoid more than
858 * hr_timeout_ms between disk writes. On busy systems 858 * hr_timeout_ms between disk writes. On busy systems
859 * this should result in a heartbeat which is less 859 * this should result in a heartbeat which is less
860 * likely to time itself out. */ 860 * likely to time itself out. */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef6..96df5416993e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
110 define_mask(QUORUM), 110 define_mask(QUORUM),
111 define_mask(EXPORT), 111 define_mask(EXPORT),
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA),
113 define_mask(ERROR), 114 define_mask(ERROR),
114 define_mask(NOTICE), 115 define_mask(NOTICE),
115 define_mask(KTHREAD), 116 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c680471..7e72a81bc2d4 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ 113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ 114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
115#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 115#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
116#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116/* bits that are infrequently given and frequently matched in the high word */ 117/* bits that are infrequently given and frequently matched in the high word */
117#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
118#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ 119#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b1cc7c381e88..e9d7c2038c0f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -38,6 +38,7 @@
38#include "dlmglue.h" 38#include "dlmglue.h"
39#include "file.h" 39#include "file.h"
40#include "inode.h" 40#include "inode.h"
41#include "super.h"
41 42
42 43
43static int ocfs2_dentry_revalidate(struct dentry *dentry, 44static int ocfs2_dentry_revalidate(struct dentry *dentry,
@@ -294,6 +295,34 @@ out_attach:
294 return ret; 295 return ret;
295} 296}
296 297
298static DEFINE_SPINLOCK(dentry_list_lock);
299
300/* We limit the number of dentry locks to drop in one go. We have
301 * this limit so that we don't starve other users of ocfs2_wq. */
302#define DL_INODE_DROP_COUNT 64
303
304/* Drop inode references from dentry locks */
305void ocfs2_drop_dl_inodes(struct work_struct *work)
306{
307 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
308 dentry_lock_work);
309 struct ocfs2_dentry_lock *dl;
310 int drop_count = DL_INODE_DROP_COUNT;
311
312 spin_lock(&dentry_list_lock);
313 while (osb->dentry_lock_list && drop_count--) {
314 dl = osb->dentry_lock_list;
315 osb->dentry_lock_list = dl->dl_next;
316 spin_unlock(&dentry_list_lock);
317 iput(dl->dl_inode);
318 kfree(dl);
319 spin_lock(&dentry_list_lock);
320 }
321 if (osb->dentry_lock_list)
322 queue_work(ocfs2_wq, &osb->dentry_lock_work);
323 spin_unlock(&dentry_list_lock);
324}
325
297/* 326/*
298 * ocfs2_dentry_iput() and friends. 327 * ocfs2_dentry_iput() and friends.
299 * 328 *
@@ -318,16 +347,23 @@ out_attach:
318static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, 347static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
319 struct ocfs2_dentry_lock *dl) 348 struct ocfs2_dentry_lock *dl)
320{ 349{
321 iput(dl->dl_inode);
322 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); 350 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
323 ocfs2_lock_res_free(&dl->dl_lockres); 351 ocfs2_lock_res_free(&dl->dl_lockres);
324 kfree(dl); 352
353 /* We leave dropping of inode reference to ocfs2_wq as that can
354 * possibly lead to inode deletion which gets tricky */
355 spin_lock(&dentry_list_lock);
356 if (!osb->dentry_lock_list)
357 queue_work(ocfs2_wq, &osb->dentry_lock_work);
358 dl->dl_next = osb->dentry_lock_list;
359 osb->dentry_lock_list = dl;
360 spin_unlock(&dentry_list_lock);
325} 361}
326 362
327void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 363void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
328 struct ocfs2_dentry_lock *dl) 364 struct ocfs2_dentry_lock *dl)
329{ 365{
330 int unlock = 0; 366 int unlock;
331 367
332 BUG_ON(dl->dl_count == 0); 368 BUG_ON(dl->dl_count == 0);
333 369
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34d9883..d06e16c06640 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,8 +29,13 @@
29extern struct dentry_operations ocfs2_dentry_ops; 29extern struct dentry_operations ocfs2_dentry_ops;
30 30
31struct ocfs2_dentry_lock { 31struct ocfs2_dentry_lock {
32 /* Use count of dentry lock */
32 unsigned int dl_count; 33 unsigned int dl_count;
33 u64 dl_parent_blkno; 34 union {
35 /* Linked list of dentry locks to release */
36 struct ocfs2_dentry_lock *dl_next;
37 u64 dl_parent_blkno;
38 };
34 39
35 /* 40 /*
36 * The ocfs2_dentry_lock keeps an inode reference until 41 * The ocfs2_dentry_lock keeps an inode reference until
@@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
47void ocfs2_dentry_lock_put(struct ocfs2_super *osb, 52void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
48 struct ocfs2_dentry_lock *dl); 53 struct ocfs2_dentry_lock *dl);
49 54
55void ocfs2_drop_dl_inodes(struct work_struct *work);
56
50struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, 57struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
51 int skip_unhashed); 58 int skip_unhashed);
52 59
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb85187..f2c4098cf337 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h>
43 44
44#define MLOG_MASK_PREFIX ML_NAMEI 45#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -47,6 +48,7 @@
47#include "ocfs2.h" 48#include "ocfs2.h"
48 49
49#include "alloc.h" 50#include "alloc.h"
51#include "blockcheck.h"
50#include "dir.h" 52#include "dir.h"
51#include "dlmglue.h" 53#include "dlmglue.h"
52#include "extent_map.h" 54#include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
82 struct ocfs2_alloc_context *meta_ac, 84 struct ocfs2_alloc_context *meta_ac,
83 struct buffer_head **new_bh); 85 struct buffer_head **new_bh);
84 86
85static struct buffer_head *ocfs2_bread(struct inode *inode, 87/*
86 int block, int *err, int reada) 88 * These are distinct checks because future versions of the file system will
89 * want to have a trailing dirent structure independent of indexing.
90 */
91static int ocfs2_dir_has_trailer(struct inode *dir)
87{ 92{
88 struct buffer_head *bh = NULL; 93 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
89 int tmperr; 94 return 0;
90 u64 p_blkno;
91 int readflags = 0;
92 95
93 if (reada) 96 return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
94 readflags |= OCFS2_BH_READAHEAD; 97}
95 98
96 if (((u64)block << inode->i_sb->s_blocksize_bits) >= 99static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
97 i_size_read(inode)) { 100{
98 BUG_ON(!reada); 101 return ocfs2_meta_ecc(osb);
99 return NULL; 102}
100 }
101 103
102 down_read(&OCFS2_I(inode)->ip_alloc_sem); 104static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
103 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, 105{
104 NULL); 106 return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
105 up_read(&OCFS2_I(inode)->ip_alloc_sem); 107}
106 if (tmperr < 0) {
107 mlog_errno(tmperr);
108 goto fail;
109 }
110 108
111 tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags); 109#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
112 if (tmperr < 0)
113 goto fail;
114 110
115 tmperr = 0; 111/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
112 * them more consistent? */
113struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
114 void *data)
115{
116 char *p = data;
116 117
117 *err = 0; 118 p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
118 return bh; 119 return (struct ocfs2_dir_block_trailer *)p;
120}
119 121
120fail: 122/*
121 brelse(bh); 123 * XXX: This is executed once on every dirent. We should consider optimizing
122 bh = NULL; 124 * it.
125 */
126static int ocfs2_skip_dir_trailer(struct inode *dir,
127 struct ocfs2_dir_entry *de,
128 unsigned long offset,
129 unsigned long blklen)
130{
131 unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
123 132
124 *err = -EIO; 133 if (!ocfs2_dir_has_trailer(dir))
125 return NULL; 134 return 0;
135
136 if (offset != toff)
137 return 0;
138
139 return 1;
140}
141
142static void ocfs2_init_dir_trailer(struct inode *inode,
143 struct buffer_head *bh)
144{
145 struct ocfs2_dir_block_trailer *trailer;
146
147 trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
148 strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
149 trailer->db_compat_rec_len =
150 cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
151 trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
152 trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
126} 153}
127 154
128/* 155/*
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
231 struct ocfs2_dinode *di; 258 struct ocfs2_dinode *di;
232 struct ocfs2_inline_data *data; 259 struct ocfs2_inline_data *data;
233 260
234 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); 261 ret = ocfs2_read_inode_block(dir, &di_bh);
235 if (ret) { 262 if (ret) {
236 mlog_errno(ret); 263 mlog_errno(ret);
237 goto out; 264 goto out;
@@ -250,6 +277,108 @@ out:
250 return NULL; 277 return NULL;
251} 278}
252 279
280static int ocfs2_validate_dir_block(struct super_block *sb,
281 struct buffer_head *bh)
282{
283 int rc;
284 struct ocfs2_dir_block_trailer *trailer =
285 ocfs2_trailer_from_bh(bh, sb);
286
287
288 /*
289 * We don't validate dirents here, that's handled
290 * in-place when the code walks them.
291 */
292 mlog(0, "Validating dirblock %llu\n",
293 (unsigned long long)bh->b_blocknr);
294
295 BUG_ON(!buffer_uptodate(bh));
296
297 /*
298 * If the ecc fails, we return the error but otherwise
299 * leave the filesystem running. We know any error is
300 * local to this block.
301 *
302 * Note that we are safe to call this even if the directory
303 * doesn't have a trailer. Filesystems without metaecc will do
304 * nothing, and filesystems with it will have one.
305 */
306 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
307 if (rc)
308 mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
309 (unsigned long long)bh->b_blocknr);
310
311 return rc;
312}
313
314/*
315 * This function forces all errors to -EIO for consistency with its
316 * predecessor, ocfs2_bread(). We haven't audited what returning the
317 * real error codes would do to callers. We log the real codes with
318 * mlog_errno() before we squash them.
319 */
320static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
321 struct buffer_head **bh, int flags)
322{
323 int rc = 0;
324 struct buffer_head *tmp = *bh;
325 struct ocfs2_dir_block_trailer *trailer;
326
327 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
328 ocfs2_validate_dir_block);
329 if (rc) {
330 mlog_errno(rc);
331 goto out;
332 }
333
334 /*
335 * We check the trailer here rather than in
336 * ocfs2_validate_dir_block() because that function doesn't have
337 * the inode to test.
338 */
339 if (!(flags & OCFS2_BH_READAHEAD) &&
340 ocfs2_dir_has_trailer(inode)) {
341 trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
342 if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
343 rc = -EINVAL;
344 ocfs2_error(inode->i_sb,
345 "Invalid dirblock #%llu: "
346 "signature = %.*s\n",
347 (unsigned long long)tmp->b_blocknr, 7,
348 trailer->db_signature);
349 goto out;
350 }
351 if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
352 rc = -EINVAL;
353 ocfs2_error(inode->i_sb,
354 "Directory block #%llu has an invalid "
355 "db_blkno of %llu",
356 (unsigned long long)tmp->b_blocknr,
357 (unsigned long long)le64_to_cpu(trailer->db_blkno));
358 goto out;
359 }
360 if (le64_to_cpu(trailer->db_parent_dinode) !=
361 OCFS2_I(inode)->ip_blkno) {
362 rc = -EINVAL;
363 ocfs2_error(inode->i_sb,
364 "Directory block #%llu on dinode "
365 "#%llu has an invalid parent_dinode "
366 "of %llu",
367 (unsigned long long)tmp->b_blocknr,
368 (unsigned long long)OCFS2_I(inode)->ip_blkno,
369 (unsigned long long)le64_to_cpu(trailer->db_blkno));
370 goto out;
371 }
372 }
373
374 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
375 if (!*bh)
376 *bh = tmp;
377
378out:
379 return rc ? -EIO : 0;
380}
381
253static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, 382static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
254 struct inode *dir, 383 struct inode *dir,
255 struct ocfs2_dir_entry **res_dir) 384 struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
296 } 425 }
297 num++; 426 num++;
298 427
299 bh = ocfs2_bread(dir, b++, &err, 1); 428 bh = NULL;
429 err = ocfs2_read_dir_block(dir, b++, &bh,
430 OCFS2_BH_READAHEAD);
300 bh_use[ra_max] = bh; 431 bh_use[ra_max] = bh;
301 } 432 }
302 } 433 }
303 if ((bh = bh_use[ra_ptr++]) == NULL) 434 if ((bh = bh_use[ra_ptr++]) == NULL)
304 goto next; 435 goto next;
305 if (ocfs2_read_block(dir, block, &bh)) { 436 if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
306 /* read error, skip block & hope for the best. 437 /* read error, skip block & hope for the best.
307 * ocfs2_read_block() has released the bh. */ 438 * ocfs2_read_dir_block() has released the bh. */
308 ocfs2_error(dir->i_sb, "reading directory %llu, " 439 ocfs2_error(dir->i_sb, "reading directory %llu, "
309 "offset %lu\n", 440 "offset %lu\n",
310 (unsigned long long)OCFS2_I(dir)->ip_blkno, 441 (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
381 struct inode *new_entry_inode) 512 struct inode *new_entry_inode)
382{ 513{
383 int ret; 514 int ret;
515 ocfs2_journal_access_func access = ocfs2_journal_access_db;
384 516
385 /* 517 /*
386 * The same code works fine for both inline-data and extent 518 * The same code works fine for both inline-data and extent
387 * based directories, so no need to split this up. 519 * based directories, so no need to split this up. The only
520 * difference is the journal_access function.
388 */ 521 */
389 522
390 ret = ocfs2_journal_access(handle, dir, de_bh, 523 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
391 OCFS2_JOURNAL_ACCESS_WRITE); 524 access = ocfs2_journal_access_di;
525
526 ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
392 if (ret) { 527 if (ret) {
393 mlog_errno(ret); 528 mlog_errno(ret);
394 goto out; 529 goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
410{ 545{
411 struct ocfs2_dir_entry *de, *pde; 546 struct ocfs2_dir_entry *de, *pde;
412 int i, status = -ENOENT; 547 int i, status = -ENOENT;
548 ocfs2_journal_access_func access = ocfs2_journal_access_db;
413 549
414 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); 550 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
415 551
552 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
553 access = ocfs2_journal_access_di;
554
416 i = 0; 555 i = 0;
417 pde = NULL; 556 pde = NULL;
418 de = (struct ocfs2_dir_entry *) first_de; 557 de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
423 goto bail; 562 goto bail;
424 } 563 }
425 if (de == de_del) { 564 if (de == de_del) {
426 status = ocfs2_journal_access(handle, dir, bh, 565 status = access(handle, dir, bh,
427 OCFS2_JOURNAL_ACCESS_WRITE); 566 OCFS2_JOURNAL_ACCESS_WRITE);
428 if (status < 0) { 567 if (status < 0) {
429 status = -EIO; 568 status = -EIO;
430 mlog_errno(status); 569 mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
458 struct ocfs2_dinode *di; 597 struct ocfs2_dinode *di;
459 struct ocfs2_inline_data *data; 598 struct ocfs2_inline_data *data;
460 599
461 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); 600 ret = ocfs2_read_inode_block(dir, &di_bh);
462 if (ret) { 601 if (ret) {
463 mlog_errno(ret); 602 mlog_errno(ret);
464 goto out; 603 goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
576 goto bail; 715 goto bail;
577 } 716 }
578 717
718 /* We're guaranteed that we should have space, so we
719 * can't possibly have hit the trailer...right? */
720 mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
721 "Hit dir trailer trying to insert %.*s "
722 "(namelen %d) into directory %llu. "
723 "offset is %lu, trailer offset is %d\n",
724 namelen, name, namelen,
725 (unsigned long long)parent_fe_bh->b_blocknr,
726 offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
727
579 if (ocfs2_dirent_would_fit(de, rec_len)) { 728 if (ocfs2_dirent_would_fit(de, rec_len)) {
580 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 729 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
581 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); 730 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
584 goto bail; 733 goto bail;
585 } 734 }
586 735
587 status = ocfs2_journal_access(handle, dir, insert_bh, 736 if (insert_bh == parent_fe_bh)
588 OCFS2_JOURNAL_ACCESS_WRITE); 737 status = ocfs2_journal_access_di(handle, dir,
738 insert_bh,
739 OCFS2_JOURNAL_ACCESS_WRITE);
740 else
741 status = ocfs2_journal_access_db(handle, dir,
742 insert_bh,
743 OCFS2_JOURNAL_ACCESS_WRITE);
589 /* By now the buffer is marked for journaling */ 744 /* By now the buffer is marked for journaling */
590 offset += le16_to_cpu(de->rec_len); 745 offset += le16_to_cpu(de->rec_len);
591 if (le64_to_cpu(de->inode)) { 746 if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
611 retval = 0; 766 retval = 0;
612 goto bail; 767 goto bail;
613 } 768 }
769
614 offset += le16_to_cpu(de->rec_len); 770 offset += le16_to_cpu(de->rec_len);
615 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); 771 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
616 } 772 }
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
636 struct ocfs2_inline_data *data; 792 struct ocfs2_inline_data *data;
637 struct ocfs2_dir_entry *de; 793 struct ocfs2_dir_entry *de;
638 794
639 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); 795 ret = ocfs2_read_inode_block(inode, &di_bh);
640 if (ret) { 796 if (ret) {
641 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", 797 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
642 (unsigned long long)OCFS2_I(inode)->ip_blkno); 798 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
724 int i, stored; 880 int i, stored;
725 struct buffer_head * bh, * tmp; 881 struct buffer_head * bh, * tmp;
726 struct ocfs2_dir_entry * de; 882 struct ocfs2_dir_entry * de;
727 int err;
728 struct super_block * sb = inode->i_sb; 883 struct super_block * sb = inode->i_sb;
729 unsigned int ra_sectors = 16; 884 unsigned int ra_sectors = 16;
730 885
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
735 890
736 while (!error && !stored && *f_pos < i_size_read(inode)) { 891 while (!error && !stored && *f_pos < i_size_read(inode)) {
737 blk = (*f_pos) >> sb->s_blocksize_bits; 892 blk = (*f_pos) >> sb->s_blocksize_bits;
738 bh = ocfs2_bread(inode, blk, &err, 0); 893 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
739 if (!bh) { 894 /* Skip the corrupt dirblock and keep trying */
740 mlog(ML_ERROR,
741 "directory #%llu contains a hole at offset %lld\n",
742 (unsigned long long)OCFS2_I(inode)->ip_blkno,
743 *f_pos);
744 *f_pos += sb->s_blocksize - offset; 895 *f_pos += sb->s_blocksize - offset;
745 continue; 896 continue;
746 } 897 }
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
754 || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { 905 || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
755 for (i = ra_sectors >> (sb->s_blocksize_bits - 9); 906 for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
756 i > 0; i--) { 907 i > 0; i--) {
757 tmp = ocfs2_bread(inode, ++blk, &err, 1); 908 tmp = NULL;
758 brelse(tmp); 909 if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
910 OCFS2_BH_READAHEAD))
911 brelse(tmp);
759 } 912 }
760 last_ra_blk = blk; 913 last_ra_blk = blk;
761 ra_sectors = 8; 914 ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
828 } 981 }
829 offset = 0; 982 offset = 0;
830 brelse(bh); 983 brelse(bh);
984 bh = NULL;
831 } 985 }
832 986
833 stored = 0; 987 stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
1050 return !priv.seen_other; 1204 return !priv.seen_other;
1051} 1205}
1052 1206
1053static void ocfs2_fill_initial_dirents(struct inode *inode, 1207/*
1054 struct inode *parent, 1208 * Fills "." and ".." dirents in a new directory block. Returns dirent for
1055 char *start, unsigned int size) 1209 * "..", which might be used during creation of a directory with a trailing
1210 * header. It is otherwise safe to ignore the return code.
1211 */
1212static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
1213 struct inode *parent,
1214 char *start,
1215 unsigned int size)
1056{ 1216{
1057 struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; 1217 struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
1058 1218
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
1069 de->name_len = 2; 1229 de->name_len = 2;
1070 strcpy(de->name, ".."); 1230 strcpy(de->name, "..");
1071 ocfs2_set_de_type(de, S_IFDIR); 1231 ocfs2_set_de_type(de, S_IFDIR);
1232
1233 return de;
1072} 1234}
1073 1235
1074/* 1236/*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
1086 struct ocfs2_inline_data *data = &di->id2.i_data; 1248 struct ocfs2_inline_data *data = &di->id2.i_data;
1087 unsigned int size = le16_to_cpu(data->id_count); 1249 unsigned int size = le16_to_cpu(data->id_count);
1088 1250
1089 ret = ocfs2_journal_access(handle, inode, di_bh, 1251 ret = ocfs2_journal_access_di(handle, inode, di_bh,
1090 OCFS2_JOURNAL_ACCESS_WRITE); 1252 OCFS2_JOURNAL_ACCESS_WRITE);
1091 if (ret) { 1253 if (ret) {
1092 mlog_errno(ret); 1254 mlog_errno(ret);
1093 goto out; 1255 goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1121 struct ocfs2_alloc_context *data_ac) 1283 struct ocfs2_alloc_context *data_ac)
1122{ 1284{
1123 int status; 1285 int status;
1286 unsigned int size = osb->sb->s_blocksize;
1124 struct buffer_head *new_bh = NULL; 1287 struct buffer_head *new_bh = NULL;
1288 struct ocfs2_dir_entry *de;
1125 1289
1126 mlog_entry_void(); 1290 mlog_entry_void();
1127 1291
1292 if (ocfs2_supports_dir_trailer(osb))
1293 size = ocfs2_dir_trailer_blk_off(parent->i_sb);
1294
1128 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, 1295 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
1129 data_ac, NULL, &new_bh); 1296 data_ac, NULL, &new_bh);
1130 if (status < 0) { 1297 if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1134 1301
1135 ocfs2_set_new_buffer_uptodate(inode, new_bh); 1302 ocfs2_set_new_buffer_uptodate(inode, new_bh);
1136 1303
1137 status = ocfs2_journal_access(handle, inode, new_bh, 1304 status = ocfs2_journal_access_db(handle, inode, new_bh,
1138 OCFS2_JOURNAL_ACCESS_CREATE); 1305 OCFS2_JOURNAL_ACCESS_CREATE);
1139 if (status < 0) { 1306 if (status < 0) {
1140 mlog_errno(status); 1307 mlog_errno(status);
1141 goto bail; 1308 goto bail;
1142 } 1309 }
1143 memset(new_bh->b_data, 0, osb->sb->s_blocksize); 1310 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
1144 1311
1145 ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, 1312 de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
1146 osb->sb->s_blocksize); 1313 if (ocfs2_supports_dir_trailer(osb))
1314 ocfs2_init_dir_trailer(inode, new_bh);
1147 1315
1148 status = ocfs2_journal_dirty(handle, new_bh); 1316 status = ocfs2_journal_dirty(handle, new_bh);
1149 if (status < 0) { 1317 if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
1184 data_ac); 1352 data_ac);
1185} 1353}
1186 1354
1355/*
1356 * Expand rec_len of the rightmost dirent in a directory block so that it
1357 * contains the end of our valid space for dirents. We do this during
1358 * expansion from an inline directory to one with extents. The first dir block
1359 * in that case is taken from the inline data portion of the inode block.
1360 *
1361 * We add the dir trailer if this filesystem wants it.
1362 */
1187static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, 1363static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
1188 unsigned int new_size) 1364 struct super_block *sb)
1189{ 1365{
1190 struct ocfs2_dir_entry *de; 1366 struct ocfs2_dir_entry *de;
1191 struct ocfs2_dir_entry *prev_de; 1367 struct ocfs2_dir_entry *prev_de;
1192 char *de_buf, *limit; 1368 char *de_buf, *limit;
1193 unsigned int bytes = new_size - old_size; 1369 unsigned int new_size = sb->s_blocksize;
1370 unsigned int bytes;
1371
1372 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
1373 new_size = ocfs2_dir_trailer_blk_off(sb);
1374
1375 bytes = new_size - old_size;
1194 1376
1195 limit = start + old_size; 1377 limit = start + old_size;
1196 de_buf = start; 1378 de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1216 unsigned int blocks_wanted, 1398 unsigned int blocks_wanted,
1217 struct buffer_head **first_block_bh) 1399 struct buffer_head **first_block_bh)
1218{ 1400{
1219 int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
1220 u32 alloc, bit_off, len; 1401 u32 alloc, bit_off, len;
1221 struct super_block *sb = dir->i_sb; 1402 struct super_block *sb = dir->i_sb;
1403 int ret, credits = ocfs2_inline_to_extents_credits(sb);
1222 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; 1404 u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
1223 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 1405 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
1224 struct ocfs2_inode_info *oi = OCFS2_I(dir); 1406 struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1227 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1228 handle_t *handle; 1410 handle_t *handle;
1229 struct ocfs2_extent_tree et; 1411 struct ocfs2_extent_tree et;
1412 int did_quota = 0;
1230 1413
1231 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 1414 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1232 1415
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1264 goto out_sem; 1447 goto out_sem;
1265 } 1448 }
1266 1449
1450 if (vfs_dq_alloc_space_nodirty(dir,
1451 ocfs2_clusters_to_bytes(osb->sb, alloc))) {
1452 ret = -EDQUOT;
1453 goto out_commit;
1454 }
1455 did_quota = 1;
1267 /* 1456 /*
1268 * Try to claim as many clusters as the bitmap can give though 1457 * Try to claim as many clusters as the bitmap can give though
1269 * if we only get one now, that's enough to continue. The rest 1458 * if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1290 1479
1291 ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); 1480 ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
1292 1481
1293 ret = ocfs2_journal_access(handle, dir, dirdata_bh, 1482 ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
1294 OCFS2_JOURNAL_ACCESS_CREATE); 1483 OCFS2_JOURNAL_ACCESS_CREATE);
1295 if (ret) { 1484 if (ret) {
1296 mlog_errno(ret); 1485 mlog_errno(ret);
1297 goto out_commit; 1486 goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1300 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); 1489 memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
1301 memset(dirdata_bh->b_data + i_size_read(dir), 0, 1490 memset(dirdata_bh->b_data + i_size_read(dir), 0,
1302 sb->s_blocksize - i_size_read(dir)); 1491 sb->s_blocksize - i_size_read(dir));
1303 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), 1492 ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
1304 sb->s_blocksize); 1493 if (ocfs2_supports_dir_trailer(osb))
1494 ocfs2_init_dir_trailer(dir, dirdata_bh);
1305 1495
1306 ret = ocfs2_journal_dirty(handle, dirdata_bh); 1496 ret = ocfs2_journal_dirty(handle, dirdata_bh);
1307 if (ret) { 1497 if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1317 * We let the later dirent insert modify c/mtime - to the user 1507 * We let the later dirent insert modify c/mtime - to the user
1318 * the data hasn't changed. 1508 * the data hasn't changed.
1319 */ 1509 */
1320 ret = ocfs2_journal_access(handle, dir, di_bh, 1510 ret = ocfs2_journal_access_di(handle, dir, di_bh,
1321 OCFS2_JOURNAL_ACCESS_CREATE); 1511 OCFS2_JOURNAL_ACCESS_CREATE);
1322 if (ret) { 1512 if (ret) {
1323 mlog_errno(ret); 1513 mlog_errno(ret);
1324 goto out_commit; 1514 goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1386 dirdata_bh = NULL; 1576 dirdata_bh = NULL;
1387 1577
1388out_commit: 1578out_commit:
1579 if (ret < 0 && did_quota)
1580 vfs_dq_free_space_nodirty(dir,
1581 ocfs2_clusters_to_bytes(osb->sb, 2));
1389 ocfs2_commit_trans(osb, handle); 1582 ocfs2_commit_trans(osb, handle);
1390 1583
1391out_sem: 1584out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1410 struct buffer_head **new_bh) 1603 struct buffer_head **new_bh)
1411{ 1604{
1412 int status; 1605 int status;
1413 int extend; 1606 int extend, did_quota = 0;
1414 u64 p_blkno, v_blkno; 1607 u64 p_blkno, v_blkno;
1415 1608
1416 spin_lock(&OCFS2_I(dir)->ip_lock); 1609 spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1420 if (extend) { 1613 if (extend) {
1421 u32 offset = OCFS2_I(dir)->ip_clusters; 1614 u32 offset = OCFS2_I(dir)->ip_clusters;
1422 1615
1616 if (vfs_dq_alloc_space_nodirty(dir,
1617 ocfs2_clusters_to_bytes(sb, 1))) {
1618 status = -EDQUOT;
1619 goto bail;
1620 }
1621 did_quota = 1;
1622
1423 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 1623 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
1424 1, 0, parent_fe_bh, handle, 1624 1, 0, parent_fe_bh, handle,
1425 data_ac, meta_ac, NULL); 1625 data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1445 } 1645 }
1446 status = 0; 1646 status = 0;
1447bail: 1647bail:
1648 if (did_quota && status < 0)
1649 vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
1448 mlog_exit(status); 1650 mlog_exit(status);
1449 return status; 1651 return status;
1450} 1652}
@@ -1569,16 +1771,22 @@ do_extend:
1569 1771
1570 ocfs2_set_new_buffer_uptodate(dir, new_bh); 1772 ocfs2_set_new_buffer_uptodate(dir, new_bh);
1571 1773
1572 status = ocfs2_journal_access(handle, dir, new_bh, 1774 status = ocfs2_journal_access_db(handle, dir, new_bh,
1573 OCFS2_JOURNAL_ACCESS_CREATE); 1775 OCFS2_JOURNAL_ACCESS_CREATE);
1574 if (status < 0) { 1776 if (status < 0) {
1575 mlog_errno(status); 1777 mlog_errno(status);
1576 goto bail; 1778 goto bail;
1577 } 1779 }
1578 memset(new_bh->b_data, 0, sb->s_blocksize); 1780 memset(new_bh->b_data, 0, sb->s_blocksize);
1781
1579 de = (struct ocfs2_dir_entry *) new_bh->b_data; 1782 de = (struct ocfs2_dir_entry *) new_bh->b_data;
1580 de->inode = 0; 1783 de->inode = 0;
1581 de->rec_len = cpu_to_le16(sb->s_blocksize); 1784 if (ocfs2_dir_has_trailer(dir)) {
1785 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
1786 ocfs2_init_dir_trailer(dir, new_bh);
1787 } else {
1788 de->rec_len = cpu_to_le16(sb->s_blocksize);
1789 }
1582 status = ocfs2_journal_dirty(handle, new_bh); 1790 status = ocfs2_journal_dirty(handle, new_bh);
1583 if (status < 0) { 1791 if (status < 0) {
1584 mlog_errno(status); 1792 mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1620 unsigned int *blocks_wanted) 1828 unsigned int *blocks_wanted)
1621{ 1829{
1622 int ret; 1830 int ret;
1831 struct super_block *sb = dir->i_sb;
1623 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1832 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1624 struct ocfs2_dir_entry *de, *last_de = NULL; 1833 struct ocfs2_dir_entry *de, *last_de = NULL;
1625 char *de_buf, *limit; 1834 char *de_buf, *limit;
1626 unsigned long offset = 0; 1835 unsigned long offset = 0;
1627 unsigned int rec_len, new_rec_len; 1836 unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
1837
1838 /*
1839 * This calculates how many free bytes we'd have in block zero, should
1840 * this function force expansion to an extent tree.
1841 */
1842 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
1843 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
1844 else
1845 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
1628 1846
1629 de_buf = di->id2.i_data.id_data; 1847 de_buf = di->id2.i_data.id_data;
1630 limit = de_buf + i_size_read(dir); 1848 limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1641 ret = -EEXIST; 1859 ret = -EEXIST;
1642 goto out; 1860 goto out;
1643 } 1861 }
1862 /*
1863 * No need to check for a trailing dirent record here as
1864 * they're not used for inline dirs.
1865 */
1866
1644 if (ocfs2_dirent_would_fit(de, rec_len)) { 1867 if (ocfs2_dirent_would_fit(de, rec_len)) {
1645 /* Ok, we found a spot. Return this bh and let 1868 /* Ok, we found a spot. Return this bh and let
1646 * the caller actually fill it in. */ 1869 * the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1661 * dirent can be found. 1884 * dirent can be found.
1662 */ 1885 */
1663 *blocks_wanted = 1; 1886 *blocks_wanted = 1;
1664 new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir)); 1887 new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
1665 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) 1888 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
1666 *blocks_wanted = 2; 1889 *blocks_wanted = 2;
1667 1890
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1679 struct ocfs2_dir_entry *de; 1902 struct ocfs2_dir_entry *de;
1680 struct super_block *sb = dir->i_sb; 1903 struct super_block *sb = dir->i_sb;
1681 int status; 1904 int status;
1905 int blocksize = dir->i_sb->s_blocksize;
1682 1906
1683 bh = ocfs2_bread(dir, 0, &status, 0); 1907 status = ocfs2_read_dir_block(dir, 0, &bh, 0);
1684 if (!bh) { 1908 if (status) {
1685 mlog_errno(status); 1909 mlog_errno(status);
1686 goto bail; 1910 goto bail;
1687 } 1911 }
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1702 status = -ENOSPC; 1926 status = -ENOSPC;
1703 goto bail; 1927 goto bail;
1704 } 1928 }
1705 bh = ocfs2_bread(dir, 1929 status = ocfs2_read_dir_block(dir,
1706 offset >> sb->s_blocksize_bits, 1930 offset >> sb->s_blocksize_bits,
1707 &status, 1931 &bh, 0);
1708 0); 1932 if (status) {
1709 if (!bh) {
1710 mlog_errno(status); 1933 mlog_errno(status);
1711 goto bail; 1934 goto bail;
1712 } 1935 }
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1721 status = -EEXIST; 1944 status = -EEXIST;
1722 goto bail; 1945 goto bail;
1723 } 1946 }
1947
1948 if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
1949 blocksize))
1950 goto next;
1951
1724 if (ocfs2_dirent_would_fit(de, rec_len)) { 1952 if (ocfs2_dirent_would_fit(de, rec_len)) {
1725 /* Ok, we found a spot. Return this bh and let 1953 /* Ok, we found a spot. Return this bh and let
1726 * the caller actually fill it in. */ 1954 * the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1729 status = 0; 1957 status = 0;
1730 goto bail; 1958 goto bail;
1731 } 1959 }
1960next:
1732 offset += le16_to_cpu(de->rec_len); 1961 offset += le16_to_cpu(de->rec_len);
1733 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); 1962 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
1734 } 1963 }
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d87..c511e2e18e9f 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
83 struct buffer_head *fe_bh, 83 struct buffer_head *fe_bh,
84 struct ocfs2_alloc_context *data_ac); 84 struct ocfs2_alloc_context *data_ac);
85 85
86struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
87 void *data);
86#endif /* OCFS2_DIR_H */ 88#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8ba..d07ddbe4b283 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
275 struct list_head *iter, *head=NULL; 275 struct list_head *iter, *head=NULL;
276 u64 cookie; 276 u64 cookie;
277 u32 flags; 277 u32 flags;
278 u8 node;
278 279
279 if (!dlm_grab(dlm)) { 280 if (!dlm_grab(dlm)) {
280 dlm_error(DLM_REJECTED); 281 dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
286 287
287 name = past->name; 288 name = past->name;
288 locklen = past->namelen; 289 locklen = past->namelen;
289 cookie = be64_to_cpu(past->cookie); 290 cookie = past->cookie;
290 flags = be32_to_cpu(past->flags); 291 flags = be32_to_cpu(past->flags);
292 node = past->node_idx;
291 293
292 if (locklen > DLM_LOCKID_NAME_MAX) { 294 if (locklen > DLM_LOCKID_NAME_MAX) {
293 ret = DLM_IVBUFLEN; 295 ret = DLM_IVBUFLEN;
294 mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); 296 mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
297 "handler!\n", locklen);
295 goto leave; 298 goto leave;
296 } 299 }
297 300
298 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == 301 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
299 (LKM_PUT_LVB|LKM_GET_LVB)) { 302 (LKM_PUT_LVB|LKM_GET_LVB)) {
300 mlog(ML_ERROR, "both PUT and GET lvb specified\n"); 303 mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
304 flags);
301 ret = DLM_BADARGS; 305 ret = DLM_BADARGS;
302 goto leave; 306 goto leave;
303 } 307 }
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
310 if (past->type != DLM_AST && 314 if (past->type != DLM_AST &&
311 past->type != DLM_BAST) { 315 past->type != DLM_BAST) {
312 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" 316 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
313 "name=%.*s\n", past->type, 317 "name=%.*s, node=%u\n", past->type,
314 dlm_get_lock_cookie_node(cookie), 318 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
315 dlm_get_lock_cookie_seq(cookie), 319 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
316 locklen, name); 320 locklen, name, node);
317 ret = DLM_IVLOCKID; 321 ret = DLM_IVLOCKID;
318 goto leave; 322 goto leave;
319 } 323 }
320 324
321 res = dlm_lookup_lockres(dlm, name, locklen); 325 res = dlm_lookup_lockres(dlm, name, locklen);
322 if (!res) { 326 if (!res) {
323 mlog(0, "got %sast for unknown lockres! " 327 mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
324 "cookie=%u:%llu, name=%.*s, namelen=%u\n", 328 "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
325 past->type == DLM_AST ? "" : "b", 329 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
326 dlm_get_lock_cookie_node(cookie), 330 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
327 dlm_get_lock_cookie_seq(cookie), 331 locklen, name, node);
328 locklen, name, locklen);
329 ret = DLM_IVLOCKID; 332 ret = DLM_IVLOCKID;
330 goto leave; 333 goto leave;
331 } 334 }
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
337 340
338 spin_lock(&res->spinlock); 341 spin_lock(&res->spinlock);
339 if (res->state & DLM_LOCK_RES_RECOVERING) { 342 if (res->state & DLM_LOCK_RES_RECOVERING) {
340 mlog(0, "responding with DLM_RECOVERING!\n"); 343 mlog(0, "Responding with DLM_RECOVERING!\n");
341 ret = DLM_RECOVERING; 344 ret = DLM_RECOVERING;
342 goto unlock_out; 345 goto unlock_out;
343 } 346 }
344 if (res->state & DLM_LOCK_RES_MIGRATING) { 347 if (res->state & DLM_LOCK_RES_MIGRATING) {
345 mlog(0, "responding with DLM_MIGRATING!\n"); 348 mlog(0, "Responding with DLM_MIGRATING!\n");
346 ret = DLM_MIGRATING; 349 ret = DLM_MIGRATING;
347 goto unlock_out; 350 goto unlock_out;
348 } 351 }
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
351 lock = NULL; 354 lock = NULL;
352 list_for_each(iter, head) { 355 list_for_each(iter, head) {
353 lock = list_entry (iter, struct dlm_lock, list); 356 lock = list_entry (iter, struct dlm_lock, list);
354 if (be64_to_cpu(lock->ml.cookie) == cookie) 357 if (lock->ml.cookie == cookie)
355 goto do_ast; 358 goto do_ast;
356 } 359 }
357 360
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
363 366
364 list_for_each(iter, head) { 367 list_for_each(iter, head) {
365 lock = list_entry (iter, struct dlm_lock, list); 368 lock = list_entry (iter, struct dlm_lock, list);
366 if (be64_to_cpu(lock->ml.cookie) == cookie) 369 if (lock->ml.cookie == cookie)
367 goto do_ast; 370 goto do_ast;
368 } 371 }
369 372
370 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " 373 mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
371 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 374 "node=%u\n", past->type == DLM_AST ? "" : "b",
372 dlm_get_lock_cookie_node(cookie), 375 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
373 dlm_get_lock_cookie_seq(cookie), 376 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
374 locklen, name, locklen); 377 locklen, name, node);
375 378
376 ret = DLM_NORMAL; 379 ret = DLM_NORMAL;
377unlock_out: 380unlock_out:
@@ -383,8 +386,8 @@ do_ast:
383 if (past->type == DLM_AST) { 386 if (past->type == DLM_AST) {
384 /* do not alter lock refcount. switching lists. */ 387 /* do not alter lock refcount. switching lists. */
385 list_move_tail(&lock->list, &res->granted); 388 list_move_tail(&lock->list, &res->granted);
386 mlog(0, "ast: adding to granted list... type=%d, " 389 mlog(0, "ast: Adding to granted list... type=%d, "
387 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 390 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
388 if (lock->ml.convert_type != LKM_IVMODE) { 391 if (lock->ml.convert_type != LKM_IVMODE) {
389 lock->ml.type = lock->ml.convert_type; 392 lock->ml.type = lock->ml.convert_type;
390 lock->ml.convert_type = LKM_IVMODE; 393 lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
408 dlm_do_local_bast(dlm, res, lock, past->blocked_type); 411 dlm_do_local_bast(dlm, res, lock, past->blocked_type);
409 412
410leave: 413leave:
411
412 if (res) 414 if (res)
413 dlm_lockres_put(res); 415 dlm_lockres_put(res);
414 416
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a49..bb53714813ab 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
140 unsigned int purge_count; 140 unsigned int purge_count;
141 spinlock_t spinlock; 141 spinlock_t spinlock;
142 spinlock_t ast_lock; 142 spinlock_t ast_lock;
143 spinlock_t track_lock;
143 char *name; 144 char *name;
144 u8 node_num; 145 u8 node_num;
145 u32 key; 146 u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
316 * put on a list for the dlm thread to run. */ 317 * put on a list for the dlm thread to run. */
317 unsigned long last_used; 318 unsigned long last_used;
318 319
320 struct dlm_ctxt *dlm;
321
319 unsigned migration_pending:1; 322 unsigned migration_pending:1;
320 atomic_t asts_reserved; 323 atomic_t asts_reserved;
321 spinlock_t spinlock; 324 spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175d..b32f60a5acfb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
630{ 630{
631 struct debug_lockres *dl = m->private; 631 struct debug_lockres *dl = m->private;
632 struct dlm_ctxt *dlm = dl->dl_ctxt; 632 struct dlm_ctxt *dlm = dl->dl_ctxt;
633 struct dlm_lock_resource *oldres = dl->dl_res;
633 struct dlm_lock_resource *res = NULL; 634 struct dlm_lock_resource *res = NULL;
635 struct list_head *track_list;
634 636
635 spin_lock(&dlm->spinlock); 637 spin_lock(&dlm->track_lock);
638 if (oldres)
639 track_list = &oldres->tracking;
640 else
641 track_list = &dlm->tracking_list;
636 642
637 if (dl->dl_res) { 643 list_for_each_entry(res, track_list, tracking) {
638 list_for_each_entry(res, &dl->dl_res->tracking, tracking) { 644 if (&res->tracking == &dlm->tracking_list)
639 if (dl->dl_res) { 645 res = NULL;
640 dlm_lockres_put(dl->dl_res); 646 else
641 dl->dl_res = NULL;
642 }
643 if (&res->tracking == &dlm->tracking_list) {
644 mlog(0, "End of list found, %p\n", res);
645 dl = NULL;
646 break;
647 }
648 dlm_lockres_get(res); 647 dlm_lockres_get(res);
649 dl->dl_res = res; 648 break;
650 break;
651 }
652 } else {
653 if (!list_empty(&dlm->tracking_list)) {
654 list_for_each_entry(res, &dlm->tracking_list, tracking)
655 break;
656 dlm_lockres_get(res);
657 dl->dl_res = res;
658 } else
659 dl = NULL;
660 } 649 }
650 spin_unlock(&dlm->track_lock);
661 651
662 if (dl) { 652 if (oldres)
663 spin_lock(&dl->dl_res->spinlock); 653 dlm_lockres_put(oldres);
664 dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
665 spin_unlock(&dl->dl_res->spinlock);
666 }
667 654
668 spin_unlock(&dlm->spinlock); 655 dl->dl_res = res;
656
657 if (res) {
658 spin_lock(&res->spinlock);
659 dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
660 spin_unlock(&res->spinlock);
661 } else
662 dl = NULL;
669 663
664 /* passed to seq_show */
670 return dl; 665 return dl;
671} 666}
672 667
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e8..d8d578f45613 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 spin_lock_init(&dlm->spinlock); 1550 spin_lock_init(&dlm->spinlock);
1551 spin_lock_init(&dlm->master_lock); 1551 spin_lock_init(&dlm->master_lock);
1552 spin_lock_init(&dlm->ast_lock); 1552 spin_lock_init(&dlm->ast_lock);
1553 spin_lock_init(&dlm->track_lock);
1553 INIT_LIST_HEAD(&dlm->list); 1554 INIT_LIST_HEAD(&dlm->list);
1554 INIT_LIST_HEAD(&dlm->dirty_list); 1555 INIT_LIST_HEAD(&dlm->dirty_list);
1555 INIT_LIST_HEAD(&dlm->reco.resources); 1556 INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d54020..1c9efb406a96 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
341 inode->i_mode = mode; 341 inode->i_mode = mode;
342 inode->i_uid = current_fsuid(); 342 inode->i_uid = current_fsuid();
343 inode->i_gid = current_fsgid(); 343 inode->i_gid = current_fsgid();
344 inode->i_blocks = 0;
345 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 344 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
346 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 345 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
347 inc_nlink(inode); 346 inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
367 inode->i_mode = mode; 366 inode->i_mode = mode;
368 inode->i_uid = current_fsuid(); 367 inode->i_uid = current_fsuid();
369 inode->i_gid = current_fsgid(); 368 inode->i_gid = current_fsgid();
370 inode->i_blocks = 0;
371 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 369 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
372 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 370 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
373 371
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf3683..54e182a27caf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
505static void dlm_lockres_release(struct kref *kref) 505static void dlm_lockres_release(struct kref *kref)
506{ 506{
507 struct dlm_lock_resource *res; 507 struct dlm_lock_resource *res;
508 struct dlm_ctxt *dlm;
508 509
509 res = container_of(kref, struct dlm_lock_resource, refs); 510 res = container_of(kref, struct dlm_lock_resource, refs);
511 dlm = res->dlm;
510 512
511 /* This should not happen -- all lockres' have a name 513 /* This should not happen -- all lockres' have a name
512 * associated with them at init time. */ 514 * associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
515 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 517 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
516 res->lockname.name); 518 res->lockname.name);
517 519
520 spin_lock(&dlm->track_lock);
518 if (!list_empty(&res->tracking)) 521 if (!list_empty(&res->tracking))
519 list_del_init(&res->tracking); 522 list_del_init(&res->tracking);
520 else { 523 else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
522 res->lockname.len, res->lockname.name); 525 res->lockname.len, res->lockname.name);
523 dlm_print_one_lock_resource(res); 526 dlm_print_one_lock_resource(res);
524 } 527 }
528 spin_unlock(&dlm->track_lock);
529
530 dlm_put(dlm);
525 531
526 if (!hlist_unhashed(&res->hash_node) || 532 if (!hlist_unhashed(&res->hash_node) ||
527 !list_empty(&res->granted) || 533 !list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
595 res->migration_pending = 0; 601 res->migration_pending = 0;
596 res->inflight_locks = 0; 602 res->inflight_locks = 0;
597 603
604 /* put in dlm_lockres_release */
605 dlm_grab(dlm);
606 res->dlm = dlm;
607
598 kref_init(&res->refs); 608 kref_init(&res->refs);
599 609
600 /* just for consistency */ 610 /* just for consistency */
@@ -722,14 +732,21 @@ lookup:
722 if (tmpres) { 732 if (tmpres) {
723 int dropping_ref = 0; 733 int dropping_ref = 0;
724 734
735 spin_unlock(&dlm->spinlock);
736
725 spin_lock(&tmpres->spinlock); 737 spin_lock(&tmpres->spinlock);
738 /* We wait for the other thread that is mastering the resource */
739 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
740 __dlm_wait_on_lockres(tmpres);
741 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
742 }
743
726 if (tmpres->owner == dlm->node_num) { 744 if (tmpres->owner == dlm->node_num) {
727 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 745 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
728 dlm_lockres_grab_inflight_ref(dlm, tmpres); 746 dlm_lockres_grab_inflight_ref(dlm, tmpres);
729 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 747 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
730 dropping_ref = 1; 748 dropping_ref = 1;
731 spin_unlock(&tmpres->spinlock); 749 spin_unlock(&tmpres->spinlock);
732 spin_unlock(&dlm->spinlock);
733 750
734 /* wait until done messaging the master, drop our ref to allow 751 /* wait until done messaging the master, drop our ref to allow
735 * the lockres to be purged, start over. */ 752 * the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2949 struct dlm_node_iter *iter) 2966 struct dlm_node_iter *iter)
2950{ 2967{
2951 struct dlm_migrate_request migrate; 2968 struct dlm_migrate_request migrate;
2952 int ret, status = 0; 2969 int ret, skip, status = 0;
2953 int nodenum; 2970 int nodenum;
2954 2971
2955 memset(&migrate, 0, sizeof(migrate)); 2972 memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2966 nodenum == new_master) 2983 nodenum == new_master)
2967 continue; 2984 continue;
2968 2985
2986 /* We could race exit domain. If exited, skip. */
2987 spin_lock(&dlm->spinlock);
2988 skip = (!test_bit(nodenum, dlm->domain_map));
2989 spin_unlock(&dlm->spinlock);
2990 if (skip) {
2991 clear_bit(nodenum, iter->node_map);
2992 continue;
2993 }
2994
2969 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, 2995 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
2970 &migrate, sizeof(migrate), nodenum, 2996 &migrate, sizeof(migrate), nodenum,
2971 &status); 2997 &status);
2972 if (ret < 0) 2998 if (ret < 0) {
2973 mlog_errno(ret); 2999 mlog(0, "migrate_request returned %d!\n", ret);
2974 else if (status < 0) { 3000 if (!dlm_is_host_down(ret)) {
3001 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
3002 BUG();
3003 }
3004 clear_bit(nodenum, iter->node_map);
3005 ret = 0;
3006 } else if (status < 0) {
2975 mlog(0, "migrate request (node %u) returned %d!\n", 3007 mlog(0, "migrate request (node %u) returned %d!\n",
2976 nodenum, status); 3008 nodenum, status);
2977 ret = status; 3009 ret = status;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d1295203029f 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
181 181
182 spin_lock(&res->spinlock); 182 spin_lock(&res->spinlock);
183 /* This ensures that clear refmap is sent after the set */ 183 /* This ensures that clear refmap is sent after the set */
184 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 184 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
185 DLM_LOCK_RES_MIGRATING));
185 spin_unlock(&res->spinlock); 186 spin_unlock(&res->spinlock);
186 187
187 /* clear our bit from the master's refmap, ignore errors */ 188 /* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f7..206a2370876a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/time.h> 34#include <linux/time.h>
35#include <linux/quotaops.h>
35 36
36#define MLOG_MASK_PREFIX ML_DLM_GLUE 37#define MLOG_MASK_PREFIX ML_DLM_GLUE
37#include <cluster/masklog.h> 38#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
51#include "slot_map.h" 52#include "slot_map.h"
52#include "super.h" 53#include "super.h"
53#include "uptodate.h" 54#include "uptodate.h"
55#include "quota.h"
54 56
55#include "buffer_head_io.h" 57#include "buffer_head_io.h"
56 58
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
68static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
69static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
70static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); 72static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
73static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
71 74
72/* 75/*
73 * Return value from ->downconvert_worker functions. 76 * Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
102static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 105static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
103 struct ocfs2_lock_res *lockres); 106 struct ocfs2_lock_res *lockres);
104 107
108static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
105 109
106#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) 110#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
107 111
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
111 unsigned int line, 115 unsigned int line,
112 struct ocfs2_lock_res *lockres) 116 struct ocfs2_lock_res *lockres)
113{ 117{
114 struct ocfs2_meta_lvb *lvb = 118 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
115 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
116 119
117 mlog(level, "LVB information for %s (called from %s:%u):\n", 120 mlog(level, "LVB information for %s (called from %s:%u):\n",
118 lockres->l_name, function, line); 121 lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
258 .flags = 0, 261 .flags = 0,
259}; 262};
260 263
264static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
265 .set_lvb = ocfs2_set_qinfo_lvb,
266 .get_osb = ocfs2_get_qinfo_osb,
267 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
268};
269
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 270static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 271{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 272 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
279 return (struct ocfs2_dentry_lock *)lockres->l_priv; 288 return (struct ocfs2_dentry_lock *)lockres->l_priv;
280} 289}
281 290
291static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
292{
293 BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
294
295 return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
296}
297
282static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) 298static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
283{ 299{
284 if (lockres->l_ops->get_osb) 300 if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
507 return OCFS2_SB(inode->i_sb); 523 return OCFS2_SB(inode->i_sb);
508} 524}
509 525
526static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
527{
528 struct ocfs2_mem_dqinfo *info = lockres->l_priv;
529
530 return OCFS2_SB(info->dqi_gi.dqi_sb);
531}
532
510static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) 533static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
511{ 534{
512 struct ocfs2_file_private *fp = lockres->l_priv; 535 struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
609 lockres->l_flags |= OCFS2_LOCK_NOCACHE; 632 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
610} 633}
611 634
635void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
636 struct ocfs2_mem_dqinfo *info)
637{
638 ocfs2_lock_res_init_once(lockres);
639 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
640 0, lockres->l_name);
641 ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
642 OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
643 info);
644}
645
612void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 646void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
613{ 647{
614 mlog_entry_void(); 648 mlog_entry_void();
@@ -1290,7 +1324,7 @@ again:
1290 goto out; 1324 goto out;
1291 } 1325 }
1292 1326
1293 mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", 1327 mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
1294 lockres->l_name); 1328 lockres->l_name);
1295 1329
1296 /* At this point we've gone inside the dlm and need to 1330 /* At this point we've gone inside the dlm and need to
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1829 1863
1830 mlog_entry_void(); 1864 mlog_entry_void();
1831 1865
1832 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); 1866 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1833 1867
1834 /* 1868 /*
1835 * Invalidate the LVB of a deleted inode - this way other 1869 * Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1881 1915
1882 mlog_meta_lvb(0, lockres); 1916 mlog_meta_lvb(0, lockres);
1883 1917
1884 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); 1918 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1885 1919
1886 /* We're safe here without the lockres lock... */ 1920 /* We're safe here without the lockres lock... */
1887 spin_lock(&oi->ip_lock); 1921 spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1916static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, 1950static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1917 struct ocfs2_lock_res *lockres) 1951 struct ocfs2_lock_res *lockres)
1918{ 1952{
1919 struct ocfs2_meta_lvb *lvb = 1953 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1920 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1921 1954
1922 if (lvb->lvb_version == OCFS2_LVB_VERSION 1955 if (lvb->lvb_version == OCFS2_LVB_VERSION
1923 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) 1956 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2024 } else { 2057 } else {
2025 /* Boo, we have to go to disk. */ 2058 /* Boo, we have to go to disk. */
2026 /* read bh, cast, ocfs2_refresh_inode */ 2059 /* read bh, cast, ocfs2_refresh_inode */
2027 status = ocfs2_read_block(inode, oi->ip_blkno, bh); 2060 status = ocfs2_read_inode_block(inode, bh);
2028 if (status < 0) { 2061 if (status < 0) {
2029 mlog_errno(status); 2062 mlog_errno(status);
2030 goto bail_refresh; 2063 goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2032 fe = (struct ocfs2_dinode *) (*bh)->b_data; 2065 fe = (struct ocfs2_dinode *) (*bh)->b_data;
2033 2066
2034 /* This is a good chance to make sure we're not 2067 /* This is a good chance to make sure we're not
2035 * locking an invalid object. 2068 * locking an invalid object. ocfs2_read_inode_block()
2069 * already checked that the inode block is sane.
2036 * 2070 *
2037 * We bug on a stale inode here because we checked 2071 * We bug on a stale inode here because we checked
2038 * above whether it was wiped from disk. The wiping 2072 * above whether it was wiped from disk. The wiping
2039 * node provides a guarantee that we receive that 2073 * node provides a guarantee that we receive that
2040 * message and can mark the inode before dropping any 2074 * message and can mark the inode before dropping any
2041 * locks associated with it. */ 2075 * locks associated with it. */
2042 if (!OCFS2_IS_VALID_DINODE(fe)) {
2043 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
2044 status = -EIO;
2045 goto bail_refresh;
2046 }
2047 mlog_bug_on_msg(inode->i_generation != 2076 mlog_bug_on_msg(inode->i_generation !=
2048 le32_to_cpu(fe->i_generation), 2077 le32_to_cpu(fe->i_generation),
2049 "Invalid dinode %llu disk generation: %u " 2078 "Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
2085 return 0; 2114 return 0;
2086 } 2115 }
2087 2116
2088 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); 2117 status = ocfs2_read_inode_block(inode, ret_bh);
2089 if (status < 0) 2118 if (status < 0)
2090 mlog_errno(status); 2119 mlog_errno(status);
2091 2120
@@ -2831,6 +2860,10 @@ static void ocfs2_unlock_ast(void *opaque, int error)
2831 case OCFS2_UNLOCK_CANCEL_CONVERT: 2860 case OCFS2_UNLOCK_CANCEL_CONVERT:
2832 mlog(0, "Cancel convert success for %s\n", lockres->l_name); 2861 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2833 lockres->l_action = OCFS2_AST_INVALID; 2862 lockres->l_action = OCFS2_AST_INVALID;
2863 /* Downconvert thread may have requeued this lock, we
2864 * need to wake it. */
2865 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2866 ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
2834 break; 2867 break;
2835 case OCFS2_UNLOCK_DROP_LOCK: 2868 case OCFS2_UNLOCK_DROP_LOCK:
2836 lockres->l_level = DLM_LOCK_IV; 2869 lockres->l_level = DLM_LOCK_IV;
@@ -2922,7 +2955,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2922 ocfs2_dlm_dump_lksb(&lockres->l_lksb); 2955 ocfs2_dlm_dump_lksb(&lockres->l_lksb);
2923 BUG(); 2956 BUG();
2924 } 2957 }
2925 mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", 2958 mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
2926 lockres->l_name); 2959 lockres->l_name);
2927 2960
2928 ocfs2_wait_on_busy_lock(lockres); 2961 ocfs2_wait_on_busy_lock(lockres);
@@ -3449,6 +3482,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3449 return UNBLOCK_CONTINUE_POST; 3482 return UNBLOCK_CONTINUE_POST;
3450} 3483}
3451 3484
3485static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3486{
3487 struct ocfs2_qinfo_lvb *lvb;
3488 struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
3489 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3490 oinfo->dqi_gi.dqi_type);
3491
3492 mlog_entry_void();
3493
3494 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3495 lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
3496 lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
3497 lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
3498 lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
3499 lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
3500 lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
3501 lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
3502
3503 mlog_exit_void();
3504}
3505
3506void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3507{
3508 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3509 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3510 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3511
3512 mlog_entry_void();
3513 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
3514 ocfs2_cluster_unlock(osb, lockres, level);
3515 mlog_exit_void();
3516}
3517
3518static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3519{
3520 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3521 oinfo->dqi_gi.dqi_type);
3522 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3523 struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3524 struct buffer_head *bh = NULL;
3525 struct ocfs2_global_disk_dqinfo *gdinfo;
3526 int status = 0;
3527
3528 if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
3529 info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
3530 info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
3531 oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
3532 oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
3533 oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
3534 oinfo->dqi_gi.dqi_free_entry =
3535 be32_to_cpu(lvb->lvb_free_entry);
3536 } else {
3537 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
3538 if (status) {
3539 mlog_errno(status);
3540 goto bail;
3541 }
3542 gdinfo = (struct ocfs2_global_disk_dqinfo *)
3543 (bh->b_data + OCFS2_GLOBAL_INFO_OFF);
3544 info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
3545 info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
3546 oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
3547 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
3548 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
3549 oinfo->dqi_gi.dqi_free_entry =
3550 le32_to_cpu(gdinfo->dqi_free_entry);
3551 brelse(bh);
3552 ocfs2_track_lock_refresh(lockres);
3553 }
3554
3555bail:
3556 return status;
3557}
3558
3559/* Lock quota info, this function expects at least shared lock on the quota file
3560 * so that we can safely refresh quota info from disk. */
3561int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3562{
3563 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3564 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3565 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3566 int status = 0;
3567
3568 mlog_entry_void();
3569
3570 /* On RO devices, locking really isn't needed... */
3571 if (ocfs2_is_hard_readonly(osb)) {
3572 if (ex)
3573 status = -EROFS;
3574 goto bail;
3575 }
3576 if (ocfs2_mount_local(osb))
3577 goto bail;
3578
3579 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
3580 if (status < 0) {
3581 mlog_errno(status);
3582 goto bail;
3583 }
3584 if (!ocfs2_should_refresh_lock_res(lockres))
3585 goto bail;
3586 /* OK, we have the lock but we need to refresh the quota info */
3587 status = ocfs2_refresh_qinfo(oinfo);
3588 if (status)
3589 ocfs2_qinfo_unlock(oinfo, ex);
3590 ocfs2_complete_lock_res_refresh(lockres, status);
3591bail:
3592 mlog_exit(status);
3593 return status;
3594}
3595
3452/* 3596/*
3453 * This is the filesystem locking protocol. It provides the lock handling 3597 * This is the filesystem locking protocol. It provides the lock handling
3454 * hooks for the underlying DLM. It has a maximum version number. 3598 * hooks for the underlying DLM. It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b1..3f8d9986b8e0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
49 __be32 lvb_reserved2; 49 __be32 lvb_reserved2;
50}; 50};
51 51
52#define OCFS2_QINFO_LVB_VERSION 1
53
54struct ocfs2_qinfo_lvb {
55 __u8 lvb_version;
56 __u8 lvb_reserved[3];
57 __be32 lvb_bgrace;
58 __be32 lvb_igrace;
59 __be32 lvb_syncms;
60 __be32 lvb_blocks;
61 __be32 lvb_free_blk;
62 __be32 lvb_free_entry;
63};
64
52/* ocfs2_inode_lock_full() 'arg_flags' flags */ 65/* ocfs2_inode_lock_full() 'arg_flags' flags */
53/* don't wait on recovery. */ 66/* don't wait on recovery. */
54#define OCFS2_META_LOCK_RECOVERY (0x01) 67#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
69struct ocfs2_file_private; 82struct ocfs2_file_private;
70void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, 83void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
71 struct ocfs2_file_private *fp); 84 struct ocfs2_file_private *fp);
85struct ocfs2_mem_dqinfo;
86void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
87 struct ocfs2_mem_dqinfo *info);
72void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 88void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
73int ocfs2_create_new_inode_locks(struct inode *inode); 89int ocfs2_create_new_inode_locks(struct inode *inode);
74int ocfs2_drop_inode_locks(struct inode *inode); 90int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
103void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 119void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
104int ocfs2_file_lock(struct file *file, int ex, int trylock); 120int ocfs2_file_lock(struct file *file, int ex, int trylock);
105void ocfs2_file_unlock(struct file *file); 121void ocfs2_file_unlock(struct file *file);
122int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
123void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
124
106 125
107void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 126void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
108void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 127void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac58234..f2bb1a04d253 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
293 struct ocfs2_extent_block *eb; 293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el; 294 struct ocfs2_extent_list *el;
295 295
296 ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); 296 ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
297 if (ret) { 297 if (ret) {
298 mlog_errno(ret); 298 mlog_errno(ret);
299 goto out; 299 goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
302 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 302 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
303 el = &eb->h_list; 303 el = &eb->h_list;
304 304
305 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
306 ret = -EROFS;
307 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
308 goto out;
309 }
310
311 if (el->l_tree_depth) { 305 if (el->l_tree_depth) {
312 ocfs2_error(inode->i_sb, 306 ocfs2_error(inode->i_sb,
313 "Inode %lu has non zero tree depth in " 307 "Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
381 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 375 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
382 goto no_more_extents; 376 goto no_more_extents;
383 377
384 ret = ocfs2_read_block(inode, 378 ret = ocfs2_read_extent_block(inode,
385 le64_to_cpu(eb->h_next_leaf_blk), 379 le64_to_cpu(eb->h_next_leaf_blk),
386 &next_eb_bh); 380 &next_eb_bh);
387 if (ret) { 381 if (ret) {
388 mlog_errno(ret); 382 mlog_errno(ret);
389 goto out; 383 goto out;
390 } 384 }
391 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
392
393 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
394 ret = -EROFS;
395 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
396 goto out;
397 }
398 385
386 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
399 el = &next_eb->h_list; 387 el = &next_eb->h_list;
400
401 i = ocfs2_search_for_hole_index(el, v_cluster); 388 i = ocfs2_search_for_hole_index(el, v_cluster);
402 } 389 }
403 390
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
630 if (ret == 0) 617 if (ret == 0)
631 goto out; 618 goto out;
632 619
633 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); 620 ret = ocfs2_read_inode_block(inode, &di_bh);
634 if (ret) { 621 if (ret) {
635 mlog_errno(ret); 622 mlog_errno(ret);
636 goto out; 623 goto out;
@@ -819,3 +806,74 @@ out:
819 806
820 return ret; 807 return ret;
821} 808}
809
810int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
811 struct buffer_head *bhs[], int flags,
812 int (*validate)(struct super_block *sb,
813 struct buffer_head *bh))
814{
815 int rc = 0;
816 u64 p_block, p_count;
817 int i, count, done = 0;
818
819 mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
820 "flags = %x, validate = %p)\n",
821 inode, (unsigned long long)v_block, nr, bhs, flags,
822 validate);
823
824 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
825 i_size_read(inode)) {
826 BUG_ON(!(flags & OCFS2_BH_READAHEAD));
827 goto out;
828 }
829
830 while (done < nr) {
831 down_read(&OCFS2_I(inode)->ip_alloc_sem);
832 rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
833 &p_block, &p_count, NULL);
834 up_read(&OCFS2_I(inode)->ip_alloc_sem);
835 if (rc) {
836 mlog_errno(rc);
837 break;
838 }
839
840 if (!p_block) {
841 rc = -EIO;
842 mlog(ML_ERROR,
843 "Inode #%llu contains a hole at offset %llu\n",
844 (unsigned long long)OCFS2_I(inode)->ip_blkno,
845 (unsigned long long)(v_block + done) <<
846 inode->i_sb->s_blocksize_bits);
847 break;
848 }
849
850 count = nr - done;
851 if (p_count < count)
852 count = p_count;
853
854 /*
855 * If the caller passed us bhs, they should have come
856 * from a previous readahead call to this function. Thus,
857 * they should have the right b_blocknr.
858 */
859 for (i = 0; i < count; i++) {
860 if (!bhs[done + i])
861 continue;
862 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
863 }
864
865 rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
866 flags, validate);
867 if (rc) {
868 mlog_errno(rc);
869 break;
870 }
871 done += count;
872 }
873
874out:
875 mlog_exit(rc);
876 return rc;
877}
878
879
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f34..b7dd9731b462 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 57 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el); 58 struct ocfs2_extent_list *el);
59 59
60int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
61 struct buffer_head *bhs[], int flags,
62 int (*validate)(struct super_block *sb,
63 struct buffer_head *bh));
64static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
65 struct buffer_head **bh,
66 int (*validate)(struct super_block *sb,
67 struct buffer_head *bh))
68{
69 int status = 0;
70
71 if (bh == NULL) {
72 printk("ocfs2: bh == NULL\n");
73 status = -EINVAL;
74 goto bail;
75 }
76
77 status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
78
79bail:
80 return status;
81}
82
83
60#endif /* _EXTENT_MAP_H */ 84#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b2..a5887df2cd8a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h> 37#include <linux/falloc.h>
38#include <linux/quotaops.h>
38 39
39#define MLOG_MASK_PREFIX ML_INODE 40#define MLOG_MASK_PREFIX ML_INODE
40#include <cluster/masklog.h> 41#include <cluster/masklog.h>
@@ -56,6 +57,8 @@
56#include "suballoc.h" 57#include "suballoc.h"
57#include "super.h" 58#include "super.h"
58#include "xattr.h" 59#include "xattr.h"
60#include "acl.h"
61#include "quota.h"
59 62
60#include "buffer_head_io.h" 63#include "buffer_head_io.h"
61 64
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
253 goto out; 256 goto out;
254 } 257 }
255 258
256 ret = ocfs2_journal_access(handle, inode, bh, 259 ret = ocfs2_journal_access_di(handle, inode, bh,
257 OCFS2_JOURNAL_ACCESS_WRITE); 260 OCFS2_JOURNAL_ACCESS_WRITE);
258 if (ret) { 261 if (ret) {
259 mlog_errno(ret); 262 mlog_errno(ret);
260 goto out_commit; 263 goto out_commit;
@@ -303,9 +306,9 @@ bail:
303 return status; 306 return status;
304} 307}
305 308
306static int ocfs2_simple_size_update(struct inode *inode, 309int ocfs2_simple_size_update(struct inode *inode,
307 struct buffer_head *di_bh, 310 struct buffer_head *di_bh,
308 u64 new_i_size) 311 u64 new_i_size)
309{ 312{
310 int ret; 313 int ret;
311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
350 goto out; 353 goto out;
351 } 354 }
352 355
353 status = ocfs2_journal_access(handle, inode, fe_bh, 356 status = ocfs2_journal_access_di(handle, inode, fe_bh,
354 OCFS2_JOURNAL_ACCESS_WRITE); 357 OCFS2_JOURNAL_ACCESS_WRITE);
355 if (status < 0) { 358 if (status < 0) {
356 mlog_errno(status); 359 mlog_errno(status);
357 goto out_commit; 360 goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
401 (unsigned long long)OCFS2_I(inode)->ip_blkno, 404 (unsigned long long)OCFS2_I(inode)->ip_blkno,
402 (unsigned long long)new_i_size); 405 (unsigned long long)new_i_size);
403 406
407 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
408 * already validated it */
404 fe = (struct ocfs2_dinode *) di_bh->b_data; 409 fe = (struct ocfs2_dinode *) di_bh->b_data;
405 if (!OCFS2_IS_VALID_DINODE(fe)) {
406 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
407 status = -EIO;
408 goto bail;
409 }
410 410
411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
412 "Inode %llu, inode i_size = %lld != di " 412 "Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
536 enum ocfs2_alloc_restarted why; 536 enum ocfs2_alloc_restarted why;
537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
538 struct ocfs2_extent_tree et; 538 struct ocfs2_extent_tree et;
539 int did_quota = 0;
539 540
540 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 541 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
541 542
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
545 */ 546 */
546 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 547 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
547 548
548 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 549 status = ocfs2_read_inode_block(inode, &bh);
549 if (status < 0) { 550 if (status < 0) {
550 mlog_errno(status); 551 mlog_errno(status);
551 goto leave; 552 goto leave;
552 } 553 }
553
554 fe = (struct ocfs2_dinode *) bh->b_data; 554 fe = (struct ocfs2_dinode *) bh->b_data;
555 if (!OCFS2_IS_VALID_DINODE(fe)) {
556 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
557 status = -EIO;
558 goto leave;
559 }
560 555
561restart_all: 556restart_all:
562 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 557 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
585 } 580 }
586 581
587restarted_transaction: 582restarted_transaction:
583 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
584 clusters_to_add))) {
585 status = -EDQUOT;
586 goto leave;
587 }
588 did_quota = 1;
589
588 /* reserve a write to the file entry early on - that we if we 590 /* reserve a write to the file entry early on - that we if we
589 * run out of credits in the allocation path, we can still 591 * run out of credits in the allocation path, we can still
590 * update i_size. */ 592 * update i_size. */
591 status = ocfs2_journal_access(handle, inode, bh, 593 status = ocfs2_journal_access_di(handle, inode, bh,
592 OCFS2_JOURNAL_ACCESS_WRITE); 594 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) { 595 if (status < 0) {
594 mlog_errno(status); 596 mlog_errno(status);
595 goto leave; 597 goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
622 spin_lock(&OCFS2_I(inode)->ip_lock); 624 spin_lock(&OCFS2_I(inode)->ip_lock);
623 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 625 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
624 spin_unlock(&OCFS2_I(inode)->ip_lock); 626 spin_unlock(&OCFS2_I(inode)->ip_lock);
627 /* Release unused quota reservation */
628 vfs_dq_free_space(inode,
629 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
630 did_quota = 0;
625 631
626 if (why != RESTART_NONE && clusters_to_add) { 632 if (why != RESTART_NONE && clusters_to_add) {
627 if (why == RESTART_META) { 633 if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
654 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); 660 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
655 661
656leave: 662leave:
663 if (status < 0 && did_quota)
664 vfs_dq_free_space(inode,
665 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
657 if (handle) { 666 if (handle) {
658 ocfs2_commit_trans(osb, handle); 667 ocfs2_commit_trans(osb, handle);
659 handle = NULL; 668 handle = NULL;
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
885 struct ocfs2_super *osb = OCFS2_SB(sb); 894 struct ocfs2_super *osb = OCFS2_SB(sb);
886 struct buffer_head *bh = NULL; 895 struct buffer_head *bh = NULL;
887 handle_t *handle = NULL; 896 handle_t *handle = NULL;
897 int locked[MAXQUOTAS] = {0, 0};
898 int credits, qtype;
899 struct ocfs2_mem_dqinfo *oinfo;
888 900
889 mlog_entry("(0x%p, '%.*s')\n", dentry, 901 mlog_entry("(0x%p, '%.*s')\n", dentry,
890 dentry->d_name.len, dentry->d_name.name); 902 dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
955 } 967 }
956 } 968 }
957 969
958 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 970 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
959 if (IS_ERR(handle)) { 971 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
960 status = PTR_ERR(handle); 972 credits = OCFS2_INODE_UPDATE_CREDITS;
961 mlog_errno(status); 973 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
962 goto bail_unlock; 974 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
975 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
976 oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
977 status = ocfs2_lock_global_qf(oinfo, 1);
978 if (status < 0)
979 goto bail_unlock;
980 credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
981 ocfs2_calc_qdel_credits(sb, USRQUOTA);
982 locked[USRQUOTA] = 1;
983 }
984 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
985 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
986 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
987 oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
988 status = ocfs2_lock_global_qf(oinfo, 1);
989 if (status < 0)
990 goto bail_unlock;
991 credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
992 ocfs2_calc_qdel_credits(sb, GRPQUOTA);
993 locked[GRPQUOTA] = 1;
994 }
995 handle = ocfs2_start_trans(osb, credits);
996 if (IS_ERR(handle)) {
997 status = PTR_ERR(handle);
998 mlog_errno(status);
999 goto bail_unlock;
1000 }
1001 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
1002 if (status < 0)
1003 goto bail_commit;
1004 } else {
1005 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1006 if (IS_ERR(handle)) {
1007 status = PTR_ERR(handle);
1008 mlog_errno(status);
1009 goto bail_unlock;
1010 }
963 } 1011 }
964 1012
965 /* 1013 /*
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
982bail_commit: 1030bail_commit:
983 ocfs2_commit_trans(osb, handle); 1031 ocfs2_commit_trans(osb, handle);
984bail_unlock: 1032bail_unlock:
1033 for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
1034 if (!locked[qtype])
1035 continue;
1036 oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
1037 ocfs2_unlock_global_qf(oinfo, 1);
1038 }
985 ocfs2_inode_unlock(inode, 1); 1039 ocfs2_inode_unlock(inode, 1);
986bail_unlock_rw: 1040bail_unlock_rw:
987 if (size_change) 1041 if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
989bail: 1043bail:
990 brelse(bh); 1044 brelse(bh);
991 1045
1046 if (!status && attr->ia_valid & ATTR_MODE) {
1047 status = ocfs2_acl_chmod(inode);
1048 if (status < 0)
1049 mlog_errno(status);
1050 }
1051
992 mlog_exit(status); 1052 mlog_exit(status);
993 return status; 1053 return status;
994} 1054}
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
1035 goto out; 1095 goto out;
1036 } 1096 }
1037 1097
1038 ret = generic_permission(inode, mask, NULL); 1098 ret = generic_permission(inode, mask, ocfs2_check_acl);
1039 1099
1040 ocfs2_inode_unlock(inode, 0); 1100 ocfs2_inode_unlock(inode, 0);
1041out: 1101out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1061 goto out; 1121 goto out;
1062 } 1122 }
1063 1123
1064 ret = ocfs2_journal_access(handle, inode, bh, 1124 ret = ocfs2_journal_access_di(handle, inode, bh,
1065 OCFS2_JOURNAL_ACCESS_WRITE); 1125 OCFS2_JOURNAL_ACCESS_WRITE);
1066 if (ret < 0) { 1126 if (ret < 0) {
1067 mlog_errno(ret); 1127 mlog_errno(ret);
1068 goto out_trans; 1128 goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1128{ 1188{
1129 int ret; 1189 int ret;
1130 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
1131 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1132 1191
1133 ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); 1192 ret = ocfs2_read_inode_block(inode, &bh);
1134 if (ret < 0) { 1193 if (ret < 0) {
1135 mlog_errno(ret); 1194 mlog_errno(ret);
1136 goto out; 1195 goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1156 struct buffer_head *di_bh = NULL; 1215 struct buffer_head *di_bh = NULL;
1157 1216
1158 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1217 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1159 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, 1218 ret = ocfs2_read_inode_block(inode, &di_bh);
1160 &di_bh);
1161 if (ret) { 1219 if (ret) {
1162 mlog_errno(ret); 1220 mlog_errno(ret);
1163 goto out; 1221 goto out;
@@ -1226,83 +1284,6 @@ out:
1226 return ret; 1284 return ret;
1227} 1285}
1228 1286
1229static int __ocfs2_remove_inode_range(struct inode *inode,
1230 struct buffer_head *di_bh,
1231 u32 cpos, u32 phys_cpos, u32 len,
1232 struct ocfs2_cached_dealloc_ctxt *dealloc)
1233{
1234 int ret;
1235 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
1236 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1237 struct inode *tl_inode = osb->osb_tl_inode;
1238 handle_t *handle;
1239 struct ocfs2_alloc_context *meta_ac = NULL;
1240 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1241 struct ocfs2_extent_tree et;
1242
1243 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1244
1245 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
1246 if (ret) {
1247 mlog_errno(ret);
1248 return ret;
1249 }
1250
1251 mutex_lock(&tl_inode->i_mutex);
1252
1253 if (ocfs2_truncate_log_needs_flush(osb)) {
1254 ret = __ocfs2_flush_truncate_log(osb);
1255 if (ret < 0) {
1256 mlog_errno(ret);
1257 goto out;
1258 }
1259 }
1260
1261 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
1262 if (IS_ERR(handle)) {
1263 ret = PTR_ERR(handle);
1264 mlog_errno(ret);
1265 goto out;
1266 }
1267
1268 ret = ocfs2_journal_access(handle, inode, di_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (ret) {
1271 mlog_errno(ret);
1272 goto out;
1273 }
1274
1275 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
1276 dealloc);
1277 if (ret) {
1278 mlog_errno(ret);
1279 goto out_commit;
1280 }
1281
1282 OCFS2_I(inode)->ip_clusters -= len;
1283 di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1284
1285 ret = ocfs2_journal_dirty(handle, di_bh);
1286 if (ret) {
1287 mlog_errno(ret);
1288 goto out_commit;
1289 }
1290
1291 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
1292 if (ret)
1293 mlog_errno(ret);
1294
1295out_commit:
1296 ocfs2_commit_trans(osb, handle);
1297out:
1298 mutex_unlock(&tl_inode->i_mutex);
1299
1300 if (meta_ac)
1301 ocfs2_free_alloc_context(meta_ac);
1302
1303 return ret;
1304}
1305
1306/* 1287/*
1307 * Truncate a byte range, avoiding pages within partial clusters. This 1288 * Truncate a byte range, avoiding pages within partial clusters. This
1308 * preserves those pages for the zeroing code to write to. 1289 * preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1402 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1383 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1403 struct ocfs2_cached_dealloc_ctxt dealloc; 1384 struct ocfs2_cached_dealloc_ctxt dealloc;
1404 struct address_space *mapping = inode->i_mapping; 1385 struct address_space *mapping = inode->i_mapping;
1386 struct ocfs2_extent_tree et;
1405 1387
1388 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1406 ocfs2_init_dealloc_ctxt(&dealloc); 1389 ocfs2_init_dealloc_ctxt(&dealloc);
1407 1390
1408 if (byte_len == 0) 1391 if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1458 1441
1459 /* Only do work for non-holes */ 1442 /* Only do work for non-holes */
1460 if (phys_cpos != 0) { 1443 if (phys_cpos != 0) {
1461 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, 1444 ret = ocfs2_remove_btree_range(inode, &et, cpos,
1462 phys_cpos, alloc_size, 1445 phys_cpos, alloc_size,
1463 &dealloc); 1446 &dealloc);
1464 if (ret) { 1447 if (ret) {
1465 mlog_errno(ret); 1448 mlog_errno(ret);
1466 goto out; 1449 goto out;
@@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1622 struct ocfs2_space_resv *sr) 1605 struct ocfs2_space_resv *sr)
1623{ 1606{
1624 struct inode *inode = file->f_path.dentry->d_inode; 1607 struct inode *inode = file->f_path.dentry->d_inode;
1625 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; 1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1626 1609
1627 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1628 !ocfs2_writes_unwritten_extents(osb)) 1611 !ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5f..172f9fbc9fc7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
51 struct ocfs2_alloc_context *data_ac, 51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac, 52 struct ocfs2_alloc_context *meta_ac,
53 enum ocfs2_alloc_restarted *reason_ret); 53 enum ocfs2_alloc_restarted *reason_ret);
54int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh,
56 u64 new_i_size);
54int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 57int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
55 u64 zero_to); 58 u64 zero_to);
56int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 59int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d511874..229e707bc050 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/quotaops.h>
31 32
32#include <asm/byteorder.h> 33#include <asm/byteorder.h>
33 34
@@ -37,6 +38,7 @@
37#include "ocfs2.h" 38#include "ocfs2.h"
38 39
39#include "alloc.h" 40#include "alloc.h"
41#include "blockcheck.h"
40#include "dlmglue.h" 42#include "dlmglue.h"
41#include "extent_map.h" 43#include "extent_map.h"
42#include "file.h" 44#include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
214 return 0; 216 return 0;
215} 217}
216 218
217int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 219void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
218 int create_ino) 220 int create_ino)
219{ 221{
220 struct super_block *sb; 222 struct super_block *sb;
221 struct ocfs2_super *osb; 223 struct ocfs2_super *osb;
222 int status = -EINVAL;
223 int use_plocks = 1; 224 int use_plocks = 1;
224 225
225 mlog_entry("(0x%p, size:%llu)\n", inode, 226 mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
232 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) 233 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
233 use_plocks = 0; 234 use_plocks = 0;
234 235
235 /* this means that read_inode cannot create a superblock inode 236 /*
236 * today. change if needed. */ 237 * These have all been checked by ocfs2_read_inode_block() or set
237 if (!OCFS2_IS_VALID_DINODE(fe) || 238 * by ocfs2_mknod_locked(), so a failure is a code bug.
238 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 239 */
239 mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " 240 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode
240 "signature = %.*s, flags = 0x%x\n", 241 cannot create a superblock
241 inode->i_ino, 242 inode today. change if
242 (unsigned long long)le64_to_cpu(fe->i_blkno), 7, 243 that is needed. */
243 fe->i_signature, le32_to_cpu(fe->i_flags)); 244 BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
244 goto bail; 245 BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
245 }
246 246
247 if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
248 mlog(ML_ERROR, "file entry generation does not match "
249 "superblock! osb->fs_generation=%x, "
250 "fe->i_fs_generation=%x\n",
251 osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
252 goto bail;
253 }
254 247
255 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 248 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
256 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 249 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
284 277
285 inode->i_nlink = le16_to_cpu(fe->i_links_count); 278 inode->i_nlink = le16_to_cpu(fe->i_links_count);
286 279
287 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) 280 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
288 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 281 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
282 inode->i_flags |= S_NOQUOTA;
283 }
289 284
290 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 285 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
291 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 286 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
292 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); 287 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
293 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { 288 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
294 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 289 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
290 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
291 inode->i_flags |= S_NOQUOTA;
295 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { 292 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
296 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); 293 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
297 /* we can't actually hit this as read_inode can't 294 /* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
354 351
355 ocfs2_set_inode_flags(inode); 352 ocfs2_set_inode_flags(inode);
356 353
357 status = 0; 354 mlog_exit_void();
358bail:
359 mlog_exit(status);
360 return status;
361} 355}
362 356
363static int ocfs2_read_locked_inode(struct inode *inode, 357static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
460 } 454 }
461 } 455 }
462 456
463 if (can_lock) 457 if (can_lock) {
464 status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, 458 status = ocfs2_read_inode_block_full(inode, &bh,
465 OCFS2_BH_IGNORE_CACHE); 459 OCFS2_BH_IGNORE_CACHE);
466 else 460 } else {
467 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 461 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
462 if (!status)
463 status = ocfs2_validate_inode_block(osb->sb, bh);
464 }
468 if (status < 0) { 465 if (status < 0) {
469 mlog_errno(status); 466 mlog_errno(status);
470 goto bail; 467 goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
472 469
473 status = -EINVAL; 470 status = -EINVAL;
474 fe = (struct ocfs2_dinode *) bh->b_data; 471 fe = (struct ocfs2_dinode *) bh->b_data;
475 if (!OCFS2_IS_VALID_DINODE(fe)) {
476 mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
477 (unsigned long long)args->fi_blkno, 7,
478 fe->i_signature);
479 goto bail;
480 }
481 472
482 /* 473 /*
483 * This is a code bug. Right now the caller needs to 474 * This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
491 482
492 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 483 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
493 S_ISBLK(le16_to_cpu(fe->i_mode))) 484 S_ISBLK(le16_to_cpu(fe->i_mode)))
494 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 485 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
495 486
496 if (ocfs2_populate_inode(inode, fe, 0) < 0) 487 ocfs2_populate_inode(inode, fe, 0);
497 goto bail;
498 488
499 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 489 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
500 490
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
547 goto out; 537 goto out;
548 } 538 }
549 539
550 status = ocfs2_journal_access(handle, inode, fe_bh, 540 status = ocfs2_journal_access_di(handle, inode, fe_bh,
551 OCFS2_JOURNAL_ACCESS_WRITE); 541 OCFS2_JOURNAL_ACCESS_WRITE);
552 if (status < 0) { 542 if (status < 0) {
553 mlog_errno(status); 543 mlog_errno(status);
554 goto out; 544 goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
615 goto bail; 605 goto bail;
616 } 606 }
617 607
618 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS); 608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb));
619 if (IS_ERR(handle)) { 610 if (IS_ERR(handle)) {
620 status = PTR_ERR(handle); 611 status = PTR_ERR(handle);
621 mlog_errno(status); 612 mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
630 } 621 }
631 622
632 /* set the inodes dtime */ 623 /* set the inodes dtime */
633 status = ocfs2_journal_access(handle, inode, di_bh, 624 status = ocfs2_journal_access_di(handle, inode, di_bh,
634 OCFS2_JOURNAL_ACCESS_WRITE); 625 OCFS2_JOURNAL_ACCESS_WRITE);
635 if (status < 0) { 626 if (status < 0) {
636 mlog_errno(status); 627 mlog_errno(status);
637 goto bail_commit; 628 goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
647 } 638 }
648 639
649 ocfs2_remove_from_cache(inode, di_bh); 640 ocfs2_remove_from_cache(inode, di_bh);
641 vfs_dq_free_inode(inode);
650 642
651 status = ocfs2_free_dinode(handle, inode_alloc_inode, 643 status = ocfs2_free_dinode(handle, inode_alloc_inode,
652 inode_alloc_bh, di); 644 inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
929 921
930 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 922 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
931 923
932 if (is_bad_inode(inode)) { 924 /* When we fail in read_inode() we mark inode as bad. The second test
925 * catches the case when inode allocation fails before allocating
926 * a block for inode. */
927 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
933 mlog(0, "Skipping delete of bad inode\n"); 928 mlog(0, "Skipping delete of bad inode\n");
934 goto bail; 929 goto bail;
935 } 930 }
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1195 mlog_entry("(inode %llu)\n", 1190 mlog_entry("(inode %llu)\n",
1196 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1191 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1197 1192
1198 status = ocfs2_journal_access(handle, inode, bh, 1193 status = ocfs2_journal_access_di(handle, inode, bh,
1199 OCFS2_JOURNAL_ACCESS_WRITE); 1194 OCFS2_JOURNAL_ACCESS_WRITE);
1200 if (status < 0) { 1195 if (status < 0) {
1201 mlog_errno(status); 1196 mlog_errno(status);
1202 goto leave; 1197 goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
1264 1259
1265 spin_unlock(&OCFS2_I(inode)->ip_lock); 1260 spin_unlock(&OCFS2_I(inode)->ip_lock);
1266} 1261}
1262
1263int ocfs2_validate_inode_block(struct super_block *sb,
1264 struct buffer_head *bh)
1265{
1266 int rc;
1267 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1268
1269 mlog(0, "Validating dinode %llu\n",
1270 (unsigned long long)bh->b_blocknr);
1271
1272 BUG_ON(!buffer_uptodate(bh));
1273
1274 /*
1275 * If the ecc fails, we return the error but otherwise
1276 * leave the filesystem running. We know any error is
1277 * local to this block.
1278 */
1279 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
1280 if (rc) {
1281 mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
1282 (unsigned long long)bh->b_blocknr);
1283 goto bail;
1284 }
1285
1286 /*
1287 * Errors after here are fatal.
1288 */
1289
1290 rc = -EINVAL;
1291
1292 if (!OCFS2_IS_VALID_DINODE(di)) {
1293 ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
1294 (unsigned long long)bh->b_blocknr, 7,
1295 di->i_signature);
1296 goto bail;
1297 }
1298
1299 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1300 ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
1301 (unsigned long long)bh->b_blocknr,
1302 (unsigned long long)le64_to_cpu(di->i_blkno));
1303 goto bail;
1304 }
1305
1306 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1307 ocfs2_error(sb,
1308 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
1309 (unsigned long long)bh->b_blocknr);
1310 goto bail;
1311 }
1312
1313 if (le32_to_cpu(di->i_fs_generation) !=
1314 OCFS2_SB(sb)->fs_generation) {
1315 ocfs2_error(sb,
1316 "Invalid dinode #%llu: fs_generation is %u\n",
1317 (unsigned long long)bh->b_blocknr,
1318 le32_to_cpu(di->i_fs_generation));
1319 goto bail;
1320 }
1321
1322 rc = 0;
1323
1324bail:
1325 return rc;
1326}
1327
1328int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
1329 int flags)
1330{
1331 int rc;
1332 struct buffer_head *tmp = *bh;
1333
1334 rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
1335 flags, ocfs2_validate_inode_block);
1336
1337 /* If ocfs2_read_blocks() got us a new bh, pass it up. */
1338 if (!rc && !*bh)
1339 *bh = tmp;
1340
1341 return rc;
1342}
1343
1344int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
1345{
1346 return ocfs2_read_inode_block_full(inode, bh, 0);
1347}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4a..eb3c302b38d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
128 int sysfile_type); 128 int sysfile_type);
129int ocfs2_inode_init_private(struct inode *inode); 129int ocfs2_inode_init_private(struct inode *inode);
130int ocfs2_inode_revalidate(struct dentry *dentry); 130int ocfs2_inode_revalidate(struct dentry *dentry);
131int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 131void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
132 int create_ino); 132 int create_ino);
133void ocfs2_read_inode(struct inode *inode); 133void ocfs2_read_inode(struct inode *inode);
134void ocfs2_read_inode2(struct inode *inode, void *opaque); 134void ocfs2_read_inode2(struct inode *inode, void *opaque);
135ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, 135ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
142 struct buffer_head *bh); 142 struct buffer_head *bh);
143int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); 143int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
144int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); 144int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
145struct buffer_head *ocfs2_bread(struct inode *inode,
146 int block, int *err, int reada);
145 147
146void ocfs2_set_inode_flags(struct inode *inode); 148void ocfs2_set_inode_flags(struct inode *inode);
147void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); 149void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
153 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); 155 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
154} 156}
155 157
158/* Validate that a bh contains a valid inode */
159int ocfs2_validate_inode_block(struct super_block *sb,
160 struct buffer_head *bh);
161/*
162 * Read an inode block into *bh. If *bh is NULL, a bh will be allocated.
163 * This is a cached read. The inode will be validated with
164 * ocfs2_validate_inode_block().
165 */
166int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
167/* The same, but can be passed OCFS2_BH_* flags */
168int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
169 int flags);
156#endif /* OCFS2_INODE_H */ 170#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3c..57d7d25a2b9a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
35#include "ocfs2.h" 35#include "ocfs2.h"
36 36
37#include "alloc.h" 37#include "alloc.h"
38#include "blockcheck.h"
38#include "dir.h" 39#include "dir.h"
39#include "dlmglue.h" 40#include "dlmglue.h"
40#include "extent_map.h" 41#include "extent_map.h"
@@ -45,6 +46,7 @@
45#include "slot_map.h" 46#include "slot_map.h"
46#include "super.h" 47#include "super.h"
47#include "sysfile.h" 48#include "sysfile.h"
49#include "quota.h"
48 50
49#include "buffer_head_io.h" 51#include "buffer_head_io.h"
50 52
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
52 54
53static int ocfs2_force_read_journal(struct inode *inode); 55static int ocfs2_force_read_journal(struct inode *inode);
54static int ocfs2_recover_node(struct ocfs2_super *osb, 56static int ocfs2_recover_node(struct ocfs2_super *osb,
55 int node_num); 57 int node_num, int slot_num);
56static int __ocfs2_recovery_thread(void *arg); 58static int __ocfs2_recovery_thread(void *arg);
57static int ocfs2_commit_cache(struct ocfs2_super *osb); 59static int ocfs2_commit_cache(struct ocfs2_super *osb);
58static int ocfs2_wait_on_mount(struct ocfs2_super *osb); 60static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
59static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, 61static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
60 int dirty, int replayed); 62 int dirty, int replayed);
61static int ocfs2_trylock_journal(struct ocfs2_super *osb, 63static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
64 int slot); 66 int slot);
65static int ocfs2_commit_thread(void *arg); 67static int ocfs2_commit_thread(void *arg);
66 68
69static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
70{
71 return __ocfs2_wait_on_mount(osb, 0);
72}
73
74static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
75{
76 return __ocfs2_wait_on_mount(osb, 1);
77}
78
79
67 80
68/* 81/*
69 * The recovery_list is a simple linked list of node numbers to recover. 82 * The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
256 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); 269 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
257 BUG_ON(max_buffs <= 0); 270 BUG_ON(max_buffs <= 0);
258 271
259 /* JBD might support this, but our journalling code doesn't yet. */ 272 /* Nested transaction? Just return the handle... */
260 if (journal_current_handle()) { 273 if (journal_current_handle())
261 mlog(ML_ERROR, "Recursive transaction attempted!\n"); 274 return jbd2_journal_start(journal, max_buffs);
262 BUG();
263 }
264 275
265 down_read(&osb->journal->j_trans_barrier); 276 down_read(&osb->journal->j_trans_barrier);
266 277
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
285int ocfs2_commit_trans(struct ocfs2_super *osb, 296int ocfs2_commit_trans(struct ocfs2_super *osb,
286 handle_t *handle) 297 handle_t *handle)
287{ 298{
288 int ret; 299 int ret, nested;
289 struct ocfs2_journal *journal = osb->journal; 300 struct ocfs2_journal *journal = osb->journal;
290 301
291 BUG_ON(!handle); 302 BUG_ON(!handle);
292 303
304 nested = handle->h_ref > 1;
293 ret = jbd2_journal_stop(handle); 305 ret = jbd2_journal_stop(handle);
294 if (ret < 0) 306 if (ret < 0)
295 mlog_errno(ret); 307 mlog_errno(ret);
296 308
297 up_read(&journal->j_trans_barrier); 309 if (!nested)
310 up_read(&journal->j_trans_barrier);
298 311
299 return ret; 312 return ret;
300} 313}
@@ -357,10 +370,137 @@ bail:
357 return status; 370 return status;
358} 371}
359 372
360int ocfs2_journal_access(handle_t *handle, 373struct ocfs2_triggers {
361 struct inode *inode, 374 struct jbd2_buffer_trigger_type ot_triggers;
362 struct buffer_head *bh, 375 int ot_offset;
363 int type) 376};
377
378static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
379{
380 return container_of(triggers, struct ocfs2_triggers, ot_triggers);
381}
382
383static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
384 struct buffer_head *bh,
385 void *data, size_t size)
386{
387 struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
388
389 /*
390 * We aren't guaranteed to have the superblock here, so we
391 * must unconditionally compute the ecc data.
392 * __ocfs2_journal_access() will only set the triggers if
393 * metaecc is enabled.
394 */
395 ocfs2_block_check_compute(data, size, data + ot->ot_offset);
396}
397
398/*
399 * Quota blocks have their own trigger because the struct ocfs2_block_check
400 * offset depends on the blocksize.
401 */
402static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
403 struct buffer_head *bh,
404 void *data, size_t size)
405{
406 struct ocfs2_disk_dqtrailer *dqt =
407 ocfs2_block_dqtrailer(size, data);
408
409 /*
410 * We aren't guaranteed to have the superblock here, so we
411 * must unconditionally compute the ecc data.
412 * __ocfs2_journal_access() will only set the triggers if
413 * metaecc is enabled.
414 */
415 ocfs2_block_check_compute(data, size, &dqt->dq_check);
416}
417
418/*
419 * Directory blocks also have their own trigger because the
420 * struct ocfs2_block_check offset depends on the blocksize.
421 */
422static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
423 struct buffer_head *bh,
424 void *data, size_t size)
425{
426 struct ocfs2_dir_block_trailer *trailer =
427 ocfs2_dir_trailer_from_size(size, data);
428
429 /*
430 * We aren't guaranteed to have the superblock here, so we
431 * must unconditionally compute the ecc data.
432 * __ocfs2_journal_access() will only set the triggers if
433 * metaecc is enabled.
434 */
435 ocfs2_block_check_compute(data, size, &trailer->db_check);
436}
437
438static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
439 struct buffer_head *bh)
440{
441 mlog(ML_ERROR,
442 "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
443 "bh->b_blocknr = %llu\n",
444 (unsigned long)bh,
445 (unsigned long long)bh->b_blocknr);
446
447 /* We aren't guaranteed to have the superblock here - but if we
448 * don't, it'll just crash. */
449 ocfs2_error(bh->b_assoc_map->host->i_sb,
450 "JBD2 has aborted our journal, ocfs2 cannot continue\n");
451}
452
453static struct ocfs2_triggers di_triggers = {
454 .ot_triggers = {
455 .t_commit = ocfs2_commit_trigger,
456 .t_abort = ocfs2_abort_trigger,
457 },
458 .ot_offset = offsetof(struct ocfs2_dinode, i_check),
459};
460
461static struct ocfs2_triggers eb_triggers = {
462 .ot_triggers = {
463 .t_commit = ocfs2_commit_trigger,
464 .t_abort = ocfs2_abort_trigger,
465 },
466 .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
467};
468
469static struct ocfs2_triggers gd_triggers = {
470 .ot_triggers = {
471 .t_commit = ocfs2_commit_trigger,
472 .t_abort = ocfs2_abort_trigger,
473 },
474 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
475};
476
477static struct ocfs2_triggers db_triggers = {
478 .ot_triggers = {
479 .t_commit = ocfs2_db_commit_trigger,
480 .t_abort = ocfs2_abort_trigger,
481 },
482};
483
484static struct ocfs2_triggers xb_triggers = {
485 .ot_triggers = {
486 .t_commit = ocfs2_commit_trigger,
487 .t_abort = ocfs2_abort_trigger,
488 },
489 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
490};
491
492static struct ocfs2_triggers dq_triggers = {
493 .ot_triggers = {
494 .t_commit = ocfs2_dq_commit_trigger,
495 .t_abort = ocfs2_abort_trigger,
496 },
497};
498
499static int __ocfs2_journal_access(handle_t *handle,
500 struct inode *inode,
501 struct buffer_head *bh,
502 struct ocfs2_triggers *triggers,
503 int type)
364{ 504{
365 int status; 505 int status;
366 506
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
406 status = -EINVAL; 546 status = -EINVAL;
407 mlog(ML_ERROR, "Uknown access type!\n"); 547 mlog(ML_ERROR, "Uknown access type!\n");
408 } 548 }
549 if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
550 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
409 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 551 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
410 552
411 if (status < 0) 553 if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
416 return status; 558 return status;
417} 559}
418 560
561int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
562 struct buffer_head *bh, int type)
563{
564 return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
565 type);
566}
567
568int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
569 struct buffer_head *bh, int type)
570{
571 return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
572 type);
573}
574
575int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
576 struct buffer_head *bh, int type)
577{
578 return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
579 type);
580}
581
582int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
583 struct buffer_head *bh, int type)
584{
585 return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
586 type);
587}
588
589int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
590 struct buffer_head *bh, int type)
591{
592 return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
593 type);
594}
595
596int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
597 struct buffer_head *bh, int type)
598{
599 return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
600 type);
601}
602
603int ocfs2_journal_access(handle_t *handle, struct inode *inode,
604 struct buffer_head *bh, int type)
605{
606 return __ocfs2_journal_access(handle, inode, bh, NULL, type);
607}
608
419int ocfs2_journal_dirty(handle_t *handle, 609int ocfs2_journal_dirty(handle_t *handle,
420 struct buffer_head *bh) 610 struct buffer_head *bh)
421{ 611{
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
434 return status; 624 return status;
435} 625}
436 626
437#ifdef CONFIG_OCFS2_COMPAT_JBD
438int ocfs2_journal_dirty_data(handle_t *handle,
439 struct buffer_head *bh)
440{
441 int err = journal_dirty_data(handle, bh);
442 if (err)
443 mlog_errno(err);
444 /* TODO: When we can handle it, abort the handle and go RO on
445 * error here. */
446
447 return err;
448}
449#endif
450
451#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 627#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
452 628
453void ocfs2_set_journal_params(struct ocfs2_super *osb) 629void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
587 mlog_entry_void(); 763 mlog_entry_void();
588 764
589 fe = (struct ocfs2_dinode *)bh->b_data; 765 fe = (struct ocfs2_dinode *)bh->b_data;
590 if (!OCFS2_IS_VALID_DINODE(fe)) { 766
591 /* This is called from startup/shutdown which will 767 /* The journal bh on the osb always comes from ocfs2_journal_init()
592 * handle the errors in a specific manner, so no need 768 * and was validated there inside ocfs2_inode_lock_full(). It's a
593 * to call ocfs2_error() here. */ 769 * code bug if we mess it up. */
594 mlog(ML_ERROR, "Journal dinode %llu has invalid " 770 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
595 "signature: %.*s",
596 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
597 fe->i_signature);
598 status = -EIO;
599 goto out;
600 }
601 771
602 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 772 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
603 if (dirty) 773 if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
609 if (replayed) 779 if (replayed)
610 ocfs2_bump_recovery_generation(fe); 780 ocfs2_bump_recovery_generation(fe);
611 781
782 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
612 status = ocfs2_write_block(osb, bh, journal->j_inode); 783 status = ocfs2_write_block(osb, bh, journal->j_inode);
613 if (status < 0) 784 if (status < 0)
614 mlog_errno(status); 785 mlog_errno(status);
615 786
616out:
617 mlog_exit(status); 787 mlog_exit(status);
618 return status; 788 return status;
619} 789}
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
878 int lri_slot; 1048 int lri_slot;
879 struct ocfs2_dinode *lri_la_dinode; 1049 struct ocfs2_dinode *lri_la_dinode;
880 struct ocfs2_dinode *lri_tl_dinode; 1050 struct ocfs2_dinode *lri_tl_dinode;
1051 struct ocfs2_quota_recovery *lri_qrec;
881}; 1052};
882 1053
883/* Does the second half of the recovery process. By this point, the 1054/* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
898 struct ocfs2_super *osb = journal->j_osb; 1069 struct ocfs2_super *osb = journal->j_osb;
899 struct ocfs2_dinode *la_dinode, *tl_dinode; 1070 struct ocfs2_dinode *la_dinode, *tl_dinode;
900 struct ocfs2_la_recovery_item *item, *n; 1071 struct ocfs2_la_recovery_item *item, *n;
1072 struct ocfs2_quota_recovery *qrec;
901 LIST_HEAD(tmp_la_list); 1073 LIST_HEAD(tmp_la_list);
902 1074
903 mlog_entry_void(); 1075 mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
913 1085
914 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 1086 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
915 1087
1088 ocfs2_wait_on_quotas(osb);
1089
916 la_dinode = item->lri_la_dinode; 1090 la_dinode = item->lri_la_dinode;
917 if (la_dinode) { 1091 if (la_dinode) {
918 mlog(0, "Clean up local alloc %llu\n", 1092 mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
943 if (ret < 0) 1117 if (ret < 0)
944 mlog_errno(ret); 1118 mlog_errno(ret);
945 1119
1120 qrec = item->lri_qrec;
1121 if (qrec) {
1122 mlog(0, "Recovering quota files");
1123 ret = ocfs2_finish_quota_recovery(osb, qrec,
1124 item->lri_slot);
1125 if (ret < 0)
1126 mlog_errno(ret);
1127 /* Recovery info is already freed now */
1128 }
1129
946 kfree(item); 1130 kfree(item);
947 } 1131 }
948 1132
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
956static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, 1140static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
957 int slot_num, 1141 int slot_num,
958 struct ocfs2_dinode *la_dinode, 1142 struct ocfs2_dinode *la_dinode,
959 struct ocfs2_dinode *tl_dinode) 1143 struct ocfs2_dinode *tl_dinode,
1144 struct ocfs2_quota_recovery *qrec)
960{ 1145{
961 struct ocfs2_la_recovery_item *item; 1146 struct ocfs2_la_recovery_item *item;
962 1147
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
971 if (tl_dinode) 1156 if (tl_dinode)
972 kfree(tl_dinode); 1157 kfree(tl_dinode);
973 1158
1159 if (qrec)
1160 ocfs2_free_quota_recovery(qrec);
1161
974 mlog_errno(-ENOMEM); 1162 mlog_errno(-ENOMEM);
975 return; 1163 return;
976 } 1164 }
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
979 item->lri_la_dinode = la_dinode; 1167 item->lri_la_dinode = la_dinode;
980 item->lri_slot = slot_num; 1168 item->lri_slot = slot_num;
981 item->lri_tl_dinode = tl_dinode; 1169 item->lri_tl_dinode = tl_dinode;
1170 item->lri_qrec = qrec;
982 1171
983 spin_lock(&journal->j_lock); 1172 spin_lock(&journal->j_lock);
984 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1173 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
998 ocfs2_queue_recovery_completion(journal, 1187 ocfs2_queue_recovery_completion(journal,
999 osb->slot_num, 1188 osb->slot_num,
1000 osb->local_alloc_copy, 1189 osb->local_alloc_copy,
1190 NULL,
1001 NULL); 1191 NULL);
1002 ocfs2_schedule_truncate_log_flush(osb, 0); 1192 ocfs2_schedule_truncate_log_flush(osb, 0);
1003 1193
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1006 } 1196 }
1007} 1197}
1008 1198
1199void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
1200{
1201 if (osb->quota_rec) {
1202 ocfs2_queue_recovery_completion(osb->journal,
1203 osb->slot_num,
1204 NULL,
1205 NULL,
1206 osb->quota_rec);
1207 osb->quota_rec = NULL;
1208 }
1209}
1210
1009static int __ocfs2_recovery_thread(void *arg) 1211static int __ocfs2_recovery_thread(void *arg)
1010{ 1212{
1011 int status, node_num; 1213 int status, node_num, slot_num;
1012 struct ocfs2_super *osb = arg; 1214 struct ocfs2_super *osb = arg;
1013 struct ocfs2_recovery_map *rm = osb->recovery_map; 1215 struct ocfs2_recovery_map *rm = osb->recovery_map;
1216 int *rm_quota = NULL;
1217 int rm_quota_used = 0, i;
1218 struct ocfs2_quota_recovery *qrec;
1014 1219
1015 mlog_entry_void(); 1220 mlog_entry_void();
1016 1221
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
1019 goto bail; 1224 goto bail;
1020 } 1225 }
1021 1226
1227 rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
1228 if (!rm_quota) {
1229 status = -ENOMEM;
1230 goto bail;
1231 }
1022restart: 1232restart:
1023 status = ocfs2_super_lock(osb, 1); 1233 status = ocfs2_super_lock(osb, 1);
1024 if (status < 0) { 1234 if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
1032 * clear it until ocfs2_recover_node() has succeeded. */ 1242 * clear it until ocfs2_recover_node() has succeeded. */
1033 node_num = rm->rm_entries[0]; 1243 node_num = rm->rm_entries[0];
1034 spin_unlock(&osb->osb_lock); 1244 spin_unlock(&osb->osb_lock);
1035 1245 mlog(0, "checking node %d\n", node_num);
1036 status = ocfs2_recover_node(osb, node_num); 1246 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1247 if (slot_num == -ENOENT) {
1248 status = 0;
1249 mlog(0, "no slot for this node, so no recovery"
1250 "required.\n");
1251 goto skip_recovery;
1252 }
1253 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1254
1255 /* It is a bit subtle with quota recovery. We cannot do it
1256 * immediately because we have to obtain cluster locks from
1257 * quota files and we also don't want to just skip it because
1258 * then quota usage would be out of sync until some node takes
1259 * the slot. So we remember which nodes need quota recovery
1260 * and when everything else is done, we recover quotas. */
1261 for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
1262 if (i == rm_quota_used)
1263 rm_quota[rm_quota_used++] = slot_num;
1264
1265 status = ocfs2_recover_node(osb, node_num, slot_num);
1266skip_recovery:
1037 if (!status) { 1267 if (!status) {
1038 ocfs2_recovery_map_clear(osb, node_num); 1268 ocfs2_recovery_map_clear(osb, node_num);
1039 } else { 1269 } else {
@@ -1055,13 +1285,27 @@ restart:
1055 if (status < 0) 1285 if (status < 0)
1056 mlog_errno(status); 1286 mlog_errno(status);
1057 1287
1288 /* Now it is the right time to recover quotas... We have to do this under
1289 * superblock lock so that no one can start using the slot (and crash)
1290 * before we recover it */
1291 for (i = 0; i < rm_quota_used; i++) {
1292 qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
1293 if (IS_ERR(qrec)) {
1294 status = PTR_ERR(qrec);
1295 mlog_errno(status);
1296 continue;
1297 }
1298 ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
1299 NULL, NULL, qrec);
1300 }
1301
1058 ocfs2_super_unlock(osb, 1); 1302 ocfs2_super_unlock(osb, 1);
1059 1303
1060 /* We always run recovery on our own orphan dir - the dead 1304 /* We always run recovery on our own orphan dir - the dead
1061 * node(s) may have disallowed a previous inode delete. Re-processing 1305 * node(s) may have disallowed a previous inode delete. Re-processing
1062 * is therefore required. */ 1306 * is therefore required. */
1063 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 1307 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1064 NULL); 1308 NULL, NULL);
1065 1309
1066bail: 1310bail:
1067 mutex_lock(&osb->recovery_lock); 1311 mutex_lock(&osb->recovery_lock);
@@ -1076,6 +1320,9 @@ bail:
1076 1320
1077 mutex_unlock(&osb->recovery_lock); 1321 mutex_unlock(&osb->recovery_lock);
1078 1322
1323 if (rm_quota)
1324 kfree(rm_quota);
1325
1079 mlog_exit(status); 1326 mlog_exit(status);
1080 /* no one is calling kthread_stop() for us so the kthread() api 1327 /* no one is calling kthread_stop() for us so the kthread() api
1081 * requires that we call do_exit(). And it isn't exported, but 1328 * requires that we call do_exit(). And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
1135 } 1382 }
1136 SET_INODE_JOURNAL(inode); 1383 SET_INODE_JOURNAL(inode);
1137 1384
1138 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, 1385 status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
1139 OCFS2_BH_IGNORE_CACHE);
1140 if (status < 0) { 1386 if (status < 0) {
1141 mlog_errno(status); 1387 mlog_errno(status);
1142 goto bail; 1388 goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1268 osb->slot_recovery_generations[slot_num] = 1514 osb->slot_recovery_generations[slot_num] =
1269 ocfs2_get_recovery_generation(fe); 1515 ocfs2_get_recovery_generation(fe);
1270 1516
1517 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
1271 status = ocfs2_write_block(osb, bh, inode); 1518 status = ocfs2_write_block(osb, bh, inode);
1272 if (status < 0) 1519 if (status < 0)
1273 mlog_errno(status); 1520 mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
1304 * far less concerning. 1551 * far less concerning.
1305 */ 1552 */
1306static int ocfs2_recover_node(struct ocfs2_super *osb, 1553static int ocfs2_recover_node(struct ocfs2_super *osb,
1307 int node_num) 1554 int node_num, int slot_num)
1308{ 1555{
1309 int status = 0; 1556 int status = 0;
1310 int slot_num;
1311 struct ocfs2_dinode *la_copy = NULL; 1557 struct ocfs2_dinode *la_copy = NULL;
1312 struct ocfs2_dinode *tl_copy = NULL; 1558 struct ocfs2_dinode *tl_copy = NULL;
1313 1559
1314 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 1560 mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
1315 node_num, osb->node_num); 1561 node_num, slot_num, osb->node_num);
1316
1317 mlog(0, "checking node %d\n", node_num);
1318 1562
1319 /* Should not ever be called to recover ourselves -- in that 1563 /* Should not ever be called to recover ourselves -- in that
1320 * case we should've called ocfs2_journal_load instead. */ 1564 * case we should've called ocfs2_journal_load instead. */
1321 BUG_ON(osb->node_num == node_num); 1565 BUG_ON(osb->node_num == node_num);
1322 1566
1323 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1324 if (slot_num == -ENOENT) {
1325 status = 0;
1326 mlog(0, "no slot for this node, so no recovery required.\n");
1327 goto done;
1328 }
1329
1330 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1331
1332 status = ocfs2_replay_journal(osb, node_num, slot_num); 1567 status = ocfs2_replay_journal(osb, node_num, slot_num);
1333 if (status < 0) { 1568 if (status < 0) {
1334 if (status == -EBUSY) { 1569 if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1364 1599
1365 /* This will kfree the memory pointed to by la_copy and tl_copy */ 1600 /* This will kfree the memory pointed to by la_copy and tl_copy */
1366 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, 1601 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1367 tl_copy); 1602 tl_copy, NULL);
1368 1603
1369 status = 0; 1604 status = 0;
1370done: 1605done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1659 return ret; 1894 return ret;
1660} 1895}
1661 1896
1662static int ocfs2_wait_on_mount(struct ocfs2_super *osb) 1897static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
1663{ 1898{
1664 /* This check is good because ocfs2 will wait on our recovery 1899 /* This check is good because ocfs2 will wait on our recovery
1665 * thread before changing it to something other than MOUNTED 1900 * thread before changing it to something other than MOUNTED
1666 * or DISABLED. */ 1901 * or DISABLED. */
1667 wait_event(osb->osb_mount_event, 1902 wait_event(osb->osb_mount_event,
1668 atomic_read(&osb->vol_state) == VOLUME_MOUNTED || 1903 (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
1904 atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
1669 atomic_read(&osb->vol_state) == VOLUME_DISABLED); 1905 atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1670 1906
1671 /* If there's an error on mount, then we may never get to the 1907 /* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3cea..3c3532e1307c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
27#define OCFS2_JOURNAL_H 27#define OCFS2_JOURNAL_H
28 28
29#include <linux/fs.h> 29#include <linux/fs.h>
30#ifndef CONFIG_OCFS2_COMPAT_JBD 30#include <linux/jbd2.h>
31# include <linux/jbd2.h>
32#else
33# include <linux/jbd.h>
34# include "ocfs2_jbd_compat.h"
35#endif
36 31
37enum ocfs2_journal_state { 32enum ocfs2_journal_state {
38 OCFS2_JOURNAL_FREE = 0, 33 OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb,
173 int node_num); 168 int node_num);
174int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); 169int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
175void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); 170void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
171void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
176 172
177static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) 173static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
178{ 174{
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
216 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may 212 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may
217 * commit the handle to disk in the process, but will 213 * commit the handle to disk in the process, but will
218 * not release any locks taken during the transaction. 214 * not release any locks taken during the transaction.
219 * ocfs2_journal_access - Notify the handle that we want to journal this 215 * ocfs2_journal_access* - Notify the handle that we want to journal this
220 * buffer. Will have to call ocfs2_journal_dirty once 216 * buffer. Will have to call ocfs2_journal_dirty once
221 * we've actually dirtied it. Type is one of . or . 217 * we've actually dirtied it. Type is one of . or .
218 * Always call the specific flavor of
219 * ocfs2_journal_access_*() unless you intend to
220 * manage the checksum by hand.
222 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. 221 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
223 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before 222 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
224 * the current handle commits. 223 * the current handle commits.
@@ -248,10 +247,29 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
248#define OCFS2_JOURNAL_ACCESS_WRITE 1 247#define OCFS2_JOURNAL_ACCESS_WRITE 1
249#define OCFS2_JOURNAL_ACCESS_UNDO 2 248#define OCFS2_JOURNAL_ACCESS_UNDO 2
250 249
251int ocfs2_journal_access(handle_t *handle, 250
252 struct inode *inode, 251/* ocfs2_inode */
253 struct buffer_head *bh, 252int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
254 int type); 253 struct buffer_head *bh, int type);
254/* ocfs2_extent_block */
255int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
256 struct buffer_head *bh, int type);
257/* ocfs2_group_desc */
258int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
259 struct buffer_head *bh, int type);
260/* ocfs2_xattr_block */
261int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
262 struct buffer_head *bh, int type);
263/* quota blocks */
264int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
265 struct buffer_head *bh, int type);
266/* dirblock */
267int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
268 struct buffer_head *bh, int type);
269/* Anything that has no ecc */
270int ocfs2_journal_access(handle_t *handle, struct inode *inode,
271 struct buffer_head *bh, int type);
272
255/* 273/*
256 * A word about the journal_access/journal_dirty "dance". It is 274 * A word about the journal_access/journal_dirty "dance". It is
257 * entirely legal to journal_access a buffer more than once (as long 275 * entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int ocfs2_journal_access(handle_t *handle,
273 */ 291 */
274int ocfs2_journal_dirty(handle_t *handle, 292int ocfs2_journal_dirty(handle_t *handle,
275 struct buffer_head *bh); 293 struct buffer_head *bh);
276#ifdef CONFIG_OCFS2_COMPAT_JBD
277int ocfs2_journal_dirty_data(handle_t *handle,
278 struct buffer_head *bh);
279#endif
280 294
281/* 295/*
282 * Credit Macros: 296 * Credit Macros:
@@ -293,6 +307,37 @@ int ocfs2_journal_dirty_data(handle_t *handle,
293/* extended attribute block update */ 307/* extended attribute block update */
294#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 308#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
295 309
310/* global quotafile inode update, data block */
311#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
312
313/*
314 * The two writes below can accidentally see global info dirty due
315 * to set_info() quotactl so make them prepared for the writes.
316 */
317/* quota data block, global info */
318/* Write to local quota file */
319#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
320
321/* global quota data block, local quota data block, global quota inode,
322 * global quota info */
323#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
324
325static inline int ocfs2_quota_trans_credits(struct super_block *sb)
326{
327 int credits = 0;
328
329 if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
330 credits += OCFS2_QWRITE_CREDITS;
331 if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
332 credits += OCFS2_QWRITE_CREDITS;
333 return credits;
334}
335
336/* Number of credits needed for removing quota structure from file */
337int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
338/* Number of credits needed for initialization of new quota structure */
339int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
340
296/* group extend. inode update and last group update. */ 341/* group extend. inode update and last group update. */
297#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 342#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
298 343
@@ -303,8 +348,11 @@ int ocfs2_journal_dirty_data(handle_t *handle,
303 * prev. group desc. if we relink. */ 348 * prev. group desc. if we relink. */
304#define OCFS2_SUBALLOC_ALLOC (3) 349#define OCFS2_SUBALLOC_ALLOC (3)
305 350
306#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \ 351static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
307 + OCFS2_INODE_UPDATE_CREDITS) 352{
353 return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
354 ocfs2_quota_trans_credits(sb);
355}
308 356
309/* dinode + group descriptor update. We don't relink on free yet. */ 357/* dinode + group descriptor update. We don't relink on free yet. */
310#define OCFS2_SUBALLOC_FREE (2) 358#define OCFS2_SUBALLOC_FREE (2)
@@ -313,16 +361,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
313#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 361#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
314 + OCFS2_TRUNCATE_LOG_UPDATE) 362 + OCFS2_TRUNCATE_LOG_UPDATE)
315 363
316#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS) 364static inline int ocfs2_remove_extent_credits(struct super_block *sb)
365{
366 return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
367 ocfs2_quota_trans_credits(sb);
368}
317 369
318/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 370/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
319 * bitmap block for the new bit) */ 371 * bitmap block for the new bit) */
320#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 372#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
321 373
322/* parent fe, parent block, new file entry, inode alloc fe, inode alloc 374/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
323 * group descriptor + mkdir/symlink blocks */ 375 * group descriptor + mkdir/symlink blocks + quota update */
324#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ 376static inline int ocfs2_mknod_credits(struct super_block *sb)
325 + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) 377{
378 return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
379 ocfs2_quota_trans_credits(sb);
380}
326 381
327/* local alloc metadata change + main bitmap updates */ 382/* local alloc metadata change + main bitmap updates */
328#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ 383#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
@@ -332,13 +387,21 @@ int ocfs2_journal_dirty_data(handle_t *handle,
332 * for the dinode, one for the new block. */ 387 * for the dinode, one for the new block. */
333#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 388#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
334 389
335/* file update (nlink, etc) + directory mtime/ctime + dir entry block */ 390/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
336#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1) 391 * update on dir */
392static inline int ocfs2_link_credits(struct super_block *sb)
393{
394 return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
395 ocfs2_quota_trans_credits(sb);
396}
337 397
338/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 398/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
339 * dir inode link */ 399 * dir inode link */
340#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ 400static inline int ocfs2_unlink_credits(struct super_block *sb)
341 + OCFS2_LINK_CREDITS) 401{
402 /* The quota update from ocfs2_link_credits is unused here... */
403 return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
404}
342 405
343/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 406/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
344 * inode alloc group descriptor */ 407 * inode alloc group descriptor */
@@ -347,8 +410,10 @@ int ocfs2_journal_dirty_data(handle_t *handle,
347/* dinode update, old dir dinode update, new dir dinode update, old 410/* dinode update, old dir dinode update, new dir dinode update, old
348 * dir dir entry, new dir dir entry, dir entry update for renaming 411 * dir dir entry, new dir dir entry, dir entry update for renaming
349 * directory + target unlink */ 412 * directory + target unlink */
350#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ 413static inline int ocfs2_rename_credits(struct super_block *sb)
351 + OCFS2_UNLINK_CREDITS) 414{
415 return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
416}
352 417
353/* global bitmap dinode, group desc., relinked group, 418/* global bitmap dinode, group desc., relinked group,
354 * suballocator dinode, group desc., relinked group, 419 * suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
386 * credit for the dinode there. */ 451 * credit for the dinode there. */
387 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); 452 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
388 453
389 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; 454 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
455 ocfs2_quota_trans_credits(sb);
390} 456}
391 457
392static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 458static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
393{ 459{
394 int blocks = OCFS2_MKNOD_CREDITS; 460 int blocks = ocfs2_mknod_credits(sb);
395 461
396 /* links can be longer than one block so we may update many 462 /* links can be longer than one block so we may update many
397 * within our single allocated extent. */ 463 * within our single allocated extent. */
398 blocks += ocfs2_clusters_to_blocks(sb, 1); 464 blocks += ocfs2_clusters_to_blocks(sb, 1);
399 465
400 return blocks; 466 return blocks + ocfs2_quota_trans_credits(sb);
401} 467}
402 468
403static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, 469static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
434 /* update to the truncate log. */ 500 /* update to the truncate log. */
435 credits += OCFS2_TRUNCATE_LOG_UPDATE; 501 credits += OCFS2_TRUNCATE_LOG_UPDATE;
436 502
503 credits += ocfs2_quota_trans_credits(sb);
504
437 return credits; 505 return credits;
438} 506}
439 507
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c32..ec70cdbe77fc 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
36#include "ocfs2.h" 36#include "ocfs2.h"
37 37
38#include "alloc.h" 38#include "alloc.h"
39#include "blockcheck.h"
39#include "dlmglue.h" 40#include "dlmglue.h"
40#include "inode.h" 41#include "inode.h"
41#include "journal.h" 42#include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
248 goto bail; 249 goto bail;
249 } 250 }
250 251
251 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, 252 status = ocfs2_read_inode_block_full(inode, &alloc_bh,
252 &alloc_bh, OCFS2_BH_IGNORE_CACHE); 253 OCFS2_BH_IGNORE_CACHE);
253 if (status < 0) { 254 if (status < 0) {
254 mlog_errno(status); 255 mlog_errno(status);
255 goto bail; 256 goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
382 } 383 }
383 memcpy(alloc_copy, alloc, bh->b_size); 384 memcpy(alloc_copy, alloc, bh->b_size);
384 385
385 status = ocfs2_journal_access(handle, local_alloc_inode, bh, 386 status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
386 OCFS2_JOURNAL_ACCESS_WRITE); 387 OCFS2_JOURNAL_ACCESS_WRITE);
387 if (status < 0) { 388 if (status < 0) {
388 mlog_errno(status); 389 mlog_errno(status);
389 goto out_commit; 390 goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
459 460
460 mutex_lock(&inode->i_mutex); 461 mutex_lock(&inode->i_mutex);
461 462
462 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, 463 status = ocfs2_read_inode_block_full(inode, &alloc_bh,
463 &alloc_bh, OCFS2_BH_IGNORE_CACHE); 464 OCFS2_BH_IGNORE_CACHE);
464 if (status < 0) { 465 if (status < 0) {
465 mlog_errno(status); 466 mlog_errno(status);
466 goto bail; 467 goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
476 alloc = (struct ocfs2_dinode *) alloc_bh->b_data; 477 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
477 ocfs2_clear_local_alloc(alloc); 478 ocfs2_clear_local_alloc(alloc);
478 479
480 ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
479 status = ocfs2_write_block(osb, alloc_bh, inode); 481 status = ocfs2_write_block(osb, alloc_bh, inode);
480 if (status < 0) 482 if (status < 0)
481 mlog_errno(status); 483 mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
762 * delete bits from it! */ 764 * delete bits from it! */
763 *num_bits = bits_wanted; 765 *num_bits = bits_wanted;
764 766
765 status = ocfs2_journal_access(handle, local_alloc_inode, 767 status = ocfs2_journal_access_di(handle, local_alloc_inode,
766 osb->local_alloc_bh, 768 osb->local_alloc_bh,
767 OCFS2_JOURNAL_ACCESS_WRITE); 769 OCFS2_JOURNAL_ACCESS_WRITE);
768 if (status < 0) { 770 if (status < 0) {
769 mlog_errno(status); 771 mlog_errno(status);
770 goto bail; 772 goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1240 } 1242 }
1241 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); 1243 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
1242 1244
1243 status = ocfs2_journal_access(handle, local_alloc_inode, 1245 status = ocfs2_journal_access_di(handle, local_alloc_inode,
1244 osb->local_alloc_bh, 1246 osb->local_alloc_bh,
1245 OCFS2_JOURNAL_ACCESS_WRITE); 1247 OCFS2_JOURNAL_ACCESS_WRITE);
1246 if (status < 0) { 1248 if (status < 0) {
1247 mlog_errno(status); 1249 mlog_errno(status);
1248 goto bail; 1250 goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402efe..084aba86c3b2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h>
43 44
44#define MLOG_MASK_PREFIX ML_NAMEI 45#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -61,17 +62,18 @@
61#include "sysfile.h" 62#include "sysfile.h"
62#include "uptodate.h" 63#include "uptodate.h"
63#include "xattr.h" 64#include "xattr.h"
65#include "acl.h"
64 66
65#include "buffer_head_io.h" 67#include "buffer_head_io.h"
66 68
67static int ocfs2_mknod_locked(struct ocfs2_super *osb, 69static int ocfs2_mknod_locked(struct ocfs2_super *osb,
68 struct inode *dir, 70 struct inode *dir,
69 struct dentry *dentry, int mode, 71 struct inode *inode,
72 struct dentry *dentry,
70 dev_t dev, 73 dev_t dev,
71 struct buffer_head **new_fe_bh, 74 struct buffer_head **new_fe_bh,
72 struct buffer_head *parent_fe_bh, 75 struct buffer_head *parent_fe_bh,
73 handle_t *handle, 76 handle_t *handle,
74 struct inode **ret_inode,
75 struct ocfs2_alloc_context *inode_ac); 77 struct ocfs2_alloc_context *inode_ac);
76 78
77static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 79static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
186 return ret; 188 return ret;
187} 189}
188 190
191static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
192{
193 struct inode *inode;
194
195 inode = new_inode(dir->i_sb);
196 if (!inode) {
197 mlog(ML_ERROR, "new_inode failed!\n");
198 return NULL;
199 }
200
201 /* populate as many fields early on as possible - many of
202 * these are used by the support functions here and in
203 * callers. */
204 if (S_ISDIR(mode))
205 inode->i_nlink = 2;
206 else
207 inode->i_nlink = 1;
208 inode->i_uid = current_fsuid();
209 if (dir->i_mode & S_ISGID) {
210 inode->i_gid = dir->i_gid;
211 if (S_ISDIR(mode))
212 mode |= S_ISGID;
213 } else
214 inode->i_gid = current_fsgid();
215 inode->i_mode = mode;
216 vfs_dq_init(inode);
217 return inode;
218}
219
189static int ocfs2_mknod(struct inode *dir, 220static int ocfs2_mknod(struct inode *dir,
190 struct dentry *dentry, 221 struct dentry *dentry,
191 int mode, 222 int mode,
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
201 struct inode *inode = NULL; 232 struct inode *inode = NULL;
202 struct ocfs2_alloc_context *inode_ac = NULL; 233 struct ocfs2_alloc_context *inode_ac = NULL;
203 struct ocfs2_alloc_context *data_ac = NULL; 234 struct ocfs2_alloc_context *data_ac = NULL;
235 struct ocfs2_alloc_context *xattr_ac = NULL;
236 int want_clusters = 0;
237 int xattr_credits = 0;
238 struct ocfs2_security_xattr_info si = {
239 .enable = 1,
240 };
241 int did_quota_inode = 0;
204 242
205 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
206 (unsigned long)dev, dentry->d_name.len, 244 (unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
250 goto leave; 288 goto leave;
251 } 289 }
252 290
253 /* Reserve a cluster if creating an extent based directory. */ 291 inode = ocfs2_get_init_inode(dir, mode);
254 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { 292 if (!inode) {
255 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 293 status = -ENOMEM;
256 if (status < 0) { 294 mlog_errno(status);
257 if (status != -ENOSPC) 295 goto leave;
258 mlog_errno(status); 296 }
297
298 /* get security xattr */
299 status = ocfs2_init_security_get(inode, dir, &si);
300 if (status) {
301 if (status == -EOPNOTSUPP)
302 si.enable = 0;
303 else {
304 mlog_errno(status);
259 goto leave; 305 goto leave;
260 } 306 }
261 } 307 }
262 308
263 handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS); 309 /* calculate meta data/clusters for setting security and acl xattr */
310 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
311 &si, &want_clusters,
312 &xattr_credits, &xattr_ac);
313 if (status < 0) {
314 mlog_errno(status);
315 goto leave;
316 }
317
318 /* Reserve a cluster if creating an extent based directory. */
319 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
320 want_clusters += 1;
321
322 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
323 if (status < 0) {
324 if (status != -ENOSPC)
325 mlog_errno(status);
326 goto leave;
327 }
328
329 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
330 xattr_credits);
264 if (IS_ERR(handle)) { 331 if (IS_ERR(handle)) {
265 status = PTR_ERR(handle); 332 status = PTR_ERR(handle);
266 handle = NULL; 333 handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
268 goto leave; 335 goto leave;
269 } 336 }
270 337
338 /* We don't use standard VFS wrapper because we don't want vfs_dq_init
339 * to be called. */
340 if (sb_any_quota_active(osb->sb) &&
341 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
342 status = -EDQUOT;
343 goto leave;
344 }
345 did_quota_inode = 1;
346
271 /* do the real work now. */ 347 /* do the real work now. */
272 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, 348 status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
273 &new_fe_bh, parent_fe_bh, handle, 349 &new_fe_bh, parent_fe_bh, handle,
274 &inode, inode_ac); 350 inode_ac);
275 if (status < 0) { 351 if (status < 0) {
276 mlog_errno(status); 352 mlog_errno(status);
277 goto leave; 353 goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
285 goto leave; 361 goto leave;
286 } 362 }
287 363
288 status = ocfs2_journal_access(handle, dir, parent_fe_bh, 364 status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
289 OCFS2_JOURNAL_ACCESS_WRITE); 365 OCFS2_JOURNAL_ACCESS_WRITE);
290 if (status < 0) { 366 if (status < 0) {
291 mlog_errno(status); 367 mlog_errno(status);
292 goto leave; 368 goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
300 inc_nlink(dir); 376 inc_nlink(dir);
301 } 377 }
302 378
379 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
380 xattr_ac, data_ac);
381 if (status < 0) {
382 mlog_errno(status);
383 goto leave;
384 }
385
386 if (si.enable) {
387 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
388 xattr_ac, data_ac);
389 if (status < 0) {
390 mlog_errno(status);
391 goto leave;
392 }
393 }
394
303 status = ocfs2_add_entry(handle, dentry, inode, 395 status = ocfs2_add_entry(handle, dentry, inode,
304 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 396 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
305 de_bh); 397 de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
320 d_instantiate(dentry, inode); 412 d_instantiate(dentry, inode);
321 status = 0; 413 status = 0;
322leave: 414leave:
415 if (status < 0 && did_quota_inode)
416 vfs_dq_free_inode(inode);
323 if (handle) 417 if (handle)
324 ocfs2_commit_trans(osb, handle); 418 ocfs2_commit_trans(osb, handle);
325 419
@@ -331,9 +425,13 @@ leave:
331 brelse(new_fe_bh); 425 brelse(new_fe_bh);
332 brelse(de_bh); 426 brelse(de_bh);
333 brelse(parent_fe_bh); 427 brelse(parent_fe_bh);
428 kfree(si.name);
429 kfree(si.value);
334 430
335 if ((status < 0) && inode) 431 if ((status < 0) && inode) {
432 clear_nlink(inode);
336 iput(inode); 433 iput(inode);
434 }
337 435
338 if (inode_ac) 436 if (inode_ac)
339 ocfs2_free_alloc_context(inode_ac); 437 ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
341 if (data_ac) 439 if (data_ac)
342 ocfs2_free_alloc_context(data_ac); 440 ocfs2_free_alloc_context(data_ac);
343 441
442 if (xattr_ac)
443 ocfs2_free_alloc_context(xattr_ac);
444
344 mlog_exit(status); 445 mlog_exit(status);
345 446
346 return status; 447 return status;
@@ -348,12 +449,12 @@ leave:
348 449
349static int ocfs2_mknod_locked(struct ocfs2_super *osb, 450static int ocfs2_mknod_locked(struct ocfs2_super *osb,
350 struct inode *dir, 451 struct inode *dir,
351 struct dentry *dentry, int mode, 452 struct inode *inode,
453 struct dentry *dentry,
352 dev_t dev, 454 dev_t dev,
353 struct buffer_head **new_fe_bh, 455 struct buffer_head **new_fe_bh,
354 struct buffer_head *parent_fe_bh, 456 struct buffer_head *parent_fe_bh,
355 handle_t *handle, 457 handle_t *handle,
356 struct inode **ret_inode,
357 struct ocfs2_alloc_context *inode_ac) 458 struct ocfs2_alloc_context *inode_ac)
358{ 459{
359 int status = 0; 460 int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
361 struct ocfs2_extent_list *fel; 462 struct ocfs2_extent_list *fel;
362 u64 fe_blkno = 0; 463 u64 fe_blkno = 0;
363 u16 suballoc_bit; 464 u16 suballoc_bit;
364 struct inode *inode = NULL;
365 465
366 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 466 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
367 (unsigned long)dev, dentry->d_name.len, 467 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
368 dentry->d_name.name); 468 dentry->d_name.name);
369 469
370 *new_fe_bh = NULL; 470 *new_fe_bh = NULL;
371 *ret_inode = NULL;
372 471
373 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, 472 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
374 &fe_blkno); 473 &fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
377 goto leave; 476 goto leave;
378 } 477 }
379 478
380 inode = new_inode(dir->i_sb);
381 if (!inode) {
382 status = -ENOMEM;
383 mlog(ML_ERROR, "new_inode failed!\n");
384 goto leave;
385 }
386
387 /* populate as many fields early on as possible - many of 479 /* populate as many fields early on as possible - many of
388 * these are used by the support functions here and in 480 * these are used by the support functions here and in
389 * callers. */ 481 * callers. */
390 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); 482 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
391 OCFS2_I(inode)->ip_blkno = fe_blkno; 483 OCFS2_I(inode)->ip_blkno = fe_blkno;
392 if (S_ISDIR(mode))
393 inode->i_nlink = 2;
394 else
395 inode->i_nlink = 1;
396 inode->i_mode = mode;
397 spin_lock(&osb->osb_lock); 484 spin_lock(&osb->osb_lock);
398 inode->i_generation = osb->s_next_generation++; 485 inode->i_generation = osb->s_next_generation++;
399 spin_unlock(&osb->osb_lock); 486 spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
406 } 493 }
407 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); 494 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
408 495
409 status = ocfs2_journal_access(handle, inode, *new_fe_bh, 496 status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
410 OCFS2_JOURNAL_ACCESS_CREATE); 497 OCFS2_JOURNAL_ACCESS_CREATE);
411 if (status < 0) { 498 if (status < 0) {
412 mlog_errno(status); 499 mlog_errno(status);
413 goto leave; 500 goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
421 fe->i_blkno = cpu_to_le64(fe_blkno); 508 fe->i_blkno = cpu_to_le64(fe_blkno);
422 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 509 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
423 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 510 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
424 fe->i_uid = cpu_to_le32(current_fsuid()); 511 fe->i_uid = cpu_to_le32(inode->i_uid);
425 if (dir->i_mode & S_ISGID) { 512 fe->i_gid = cpu_to_le32(inode->i_gid);
426 fe->i_gid = cpu_to_le32(dir->i_gid); 513 fe->i_mode = cpu_to_le16(inode->i_mode);
427 if (S_ISDIR(mode)) 514 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
428 mode |= S_ISGID;
429 } else
430 fe->i_gid = cpu_to_le32(current_fsgid());
431 fe->i_mode = cpu_to_le16(mode);
432 if (S_ISCHR(mode) || S_ISBLK(mode))
433 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 515 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
434
435 fe->i_links_count = cpu_to_le16(inode->i_nlink); 516 fe->i_links_count = cpu_to_le16(inode->i_nlink);
436 517
437 fe->i_last_eb_blk = 0; 518 fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
446 /* 527 /*
447 * If supported, directories start with inline data. 528 * If supported, directories start with inline data.
448 */ 529 */
449 if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) { 530 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
450 u16 feat = le16_to_cpu(fe->i_dyn_features); 531 u16 feat = le16_to_cpu(fe->i_dyn_features);
451 532
452 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
465 goto leave; 546 goto leave;
466 } 547 }
467 548
468 if (ocfs2_populate_inode(inode, fe, 1) < 0) { 549 ocfs2_populate_inode(inode, fe, 1);
469 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
470 "i_blkno=%llu, i_ino=%lu\n",
471 (unsigned long long)(*new_fe_bh)->b_blocknr,
472 (unsigned long long)le64_to_cpu(fe->i_blkno),
473 inode->i_ino);
474 BUG();
475 }
476
477 ocfs2_inode_set_new(osb, inode); 550 ocfs2_inode_set_new(osb, inode);
478 if (!ocfs2_mount_local(osb)) { 551 if (!ocfs2_mount_local(osb)) {
479 status = ocfs2_create_new_inode_locks(inode); 552 status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
484 status = 0; /* error in ocfs2_create_new_inode_locks is not 557 status = 0; /* error in ocfs2_create_new_inode_locks is not
485 * critical */ 558 * critical */
486 559
487 *ret_inode = inode;
488leave: 560leave:
489 if (status < 0) { 561 if (status < 0) {
490 if (*new_fe_bh) { 562 if (*new_fe_bh) {
491 brelse(*new_fe_bh); 563 brelse(*new_fe_bh);
492 *new_fe_bh = NULL; 564 *new_fe_bh = NULL;
493 } 565 }
494 if (inode) {
495 clear_nlink(inode);
496 iput(inode);
497 }
498 } 566 }
499 567
500 mlog_exit(status); 568 mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
588 goto out_unlock_inode; 656 goto out_unlock_inode;
589 } 657 }
590 658
591 handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS); 659 handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
592 if (IS_ERR(handle)) { 660 if (IS_ERR(handle)) {
593 err = PTR_ERR(handle); 661 err = PTR_ERR(handle);
594 handle = NULL; 662 handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
596 goto out_unlock_inode; 664 goto out_unlock_inode;
597 } 665 }
598 666
599 err = ocfs2_journal_access(handle, inode, fe_bh, 667 err = ocfs2_journal_access_di(handle, inode, fe_bh,
600 OCFS2_JOURNAL_ACCESS_WRITE); 668 OCFS2_JOURNAL_ACCESS_WRITE);
601 if (err < 0) { 669 if (err < 0) {
602 mlog_errno(err); 670 mlog_errno(err);
603 goto out_commit; 671 goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
775 } 843 }
776 } 844 }
777 845
778 handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS); 846 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
779 if (IS_ERR(handle)) { 847 if (IS_ERR(handle)) {
780 status = PTR_ERR(handle); 848 status = PTR_ERR(handle);
781 handle = NULL; 849 handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
783 goto leave; 851 goto leave;
784 } 852 }
785 853
786 status = ocfs2_journal_access(handle, inode, fe_bh, 854 status = ocfs2_journal_access_di(handle, inode, fe_bh,
787 OCFS2_JOURNAL_ACCESS_WRITE); 855 OCFS2_JOURNAL_ACCESS_WRITE);
788 if (status < 0) { 856 if (status < 0) {
789 mlog_errno(status); 857 mlog_errno(status);
790 goto leave; 858 goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
1181 } 1249 }
1182 } 1250 }
1183 1251
1184 handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS); 1252 handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
1185 if (IS_ERR(handle)) { 1253 if (IS_ERR(handle)) {
1186 status = PTR_ERR(handle); 1254 status = PTR_ERR(handle);
1187 handle = NULL; 1255 handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
1197 goto bail; 1265 goto bail;
1198 } 1266 }
1199 } 1267 }
1200 status = ocfs2_journal_access(handle, new_inode, newfe_bh, 1268 status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
1201 OCFS2_JOURNAL_ACCESS_WRITE); 1269 OCFS2_JOURNAL_ACCESS_WRITE);
1202 if (status < 0) { 1270 if (status < 0) {
1203 mlog_errno(status); 1271 mlog_errno(status);
1204 goto bail; 1272 goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
1244 old_inode->i_ctime = CURRENT_TIME; 1312 old_inode->i_ctime = CURRENT_TIME;
1245 mark_inode_dirty(old_inode); 1313 mark_inode_dirty(old_inode);
1246 1314
1247 status = ocfs2_journal_access(handle, old_inode, old_inode_bh, 1315 status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
1248 OCFS2_JOURNAL_ACCESS_WRITE); 1316 OCFS2_JOURNAL_ACCESS_WRITE);
1249 if (status >= 0) { 1317 if (status >= 0) {
1250 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; 1318 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
1251 1319
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
1321 (int)old_dir_nlink, old_dir->i_nlink); 1389 (int)old_dir_nlink, old_dir->i_nlink);
1322 } else { 1390 } else {
1323 struct ocfs2_dinode *fe; 1391 struct ocfs2_dinode *fe;
1324 status = ocfs2_journal_access(handle, old_dir, 1392 status = ocfs2_journal_access_di(handle, old_dir,
1325 old_dir_bh, 1393 old_dir_bh,
1326 OCFS2_JOURNAL_ACCESS_WRITE); 1394 OCFS2_JOURNAL_ACCESS_WRITE);
1327 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1395 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1328 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1396 fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
1329 status = ocfs2_journal_dirty(handle, old_dir_bh); 1397 status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
1496 handle_t *handle = NULL; 1564 handle_t *handle = NULL;
1497 struct ocfs2_alloc_context *inode_ac = NULL; 1565 struct ocfs2_alloc_context *inode_ac = NULL;
1498 struct ocfs2_alloc_context *data_ac = NULL; 1566 struct ocfs2_alloc_context *data_ac = NULL;
1567 struct ocfs2_alloc_context *xattr_ac = NULL;
1568 int want_clusters = 0;
1569 int xattr_credits = 0;
1570 struct ocfs2_security_xattr_info si = {
1571 .enable = 1,
1572 };
1573 int did_quota = 0, did_quota_inode = 0;
1499 1574
1500 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1575 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1501 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1576 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
1542 goto bail; 1617 goto bail;
1543 } 1618 }
1544 1619
1545 /* don't reserve bitmap space for fast symlinks. */ 1620 inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
1546 if (l > ocfs2_fast_symlink_chars(sb)) { 1621 if (!inode) {
1547 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 1622 status = -ENOMEM;
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626
1627 /* get security xattr */
1628 status = ocfs2_init_security_get(inode, dir, &si);
1629 if (status) {
1630 if (status == -EOPNOTSUPP)
1631 si.enable = 0;
1632 else {
1633 mlog_errno(status);
1634 goto bail;
1635 }
1636 }
1637
1638 /* calculate meta data/clusters for setting security xattr */
1639 if (si.enable) {
1640 status = ocfs2_calc_security_init(dir, &si, &want_clusters,
1641 &xattr_credits, &xattr_ac);
1548 if (status < 0) { 1642 if (status < 0) {
1549 if (status != -ENOSPC) 1643 mlog_errno(status);
1550 mlog_errno(status);
1551 goto bail; 1644 goto bail;
1552 } 1645 }
1553 } 1646 }
1554 1647
1555 handle = ocfs2_start_trans(osb, credits); 1648 /* don't reserve bitmap space for fast symlinks. */
1649 if (l > ocfs2_fast_symlink_chars(sb))
1650 want_clusters += 1;
1651
1652 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
1653 if (status < 0) {
1654 if (status != -ENOSPC)
1655 mlog_errno(status);
1656 goto bail;
1657 }
1658
1659 handle = ocfs2_start_trans(osb, credits + xattr_credits);
1556 if (IS_ERR(handle)) { 1660 if (IS_ERR(handle)) {
1557 status = PTR_ERR(handle); 1661 status = PTR_ERR(handle);
1558 handle = NULL; 1662 handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
1560 goto bail; 1664 goto bail;
1561 } 1665 }
1562 1666
1563 status = ocfs2_mknod_locked(osb, dir, dentry, 1667 /* We don't use standard VFS wrapper because we don't want vfs_dq_init
1564 S_IFLNK | S_IRWXUGO, 0, 1668 * to be called. */
1565 &new_fe_bh, parent_fe_bh, handle, 1669 if (sb_any_quota_active(osb->sb) &&
1566 &inode, inode_ac); 1670 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
1671 status = -EDQUOT;
1672 goto bail;
1673 }
1674 did_quota_inode = 1;
1675
1676 status = ocfs2_mknod_locked(osb, dir, inode, dentry,
1677 0, &new_fe_bh, parent_fe_bh, handle,
1678 inode_ac);
1567 if (status < 0) { 1679 if (status < 0) {
1568 mlog_errno(status); 1680 mlog_errno(status);
1569 goto bail; 1681 goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
1576 u32 offset = 0; 1688 u32 offset = 0;
1577 1689
1578 inode->i_op = &ocfs2_symlink_inode_operations; 1690 inode->i_op = &ocfs2_symlink_inode_operations;
1691 if (vfs_dq_alloc_space_nodirty(inode,
1692 ocfs2_clusters_to_bytes(osb->sb, 1))) {
1693 status = -EDQUOT;
1694 goto bail;
1695 }
1696 did_quota = 1;
1579 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1697 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1580 new_fe_bh, 1698 new_fe_bh,
1581 handle, data_ac, NULL, 1699 handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
1614 } 1732 }
1615 } 1733 }
1616 1734
1735 if (si.enable) {
1736 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
1737 xattr_ac, data_ac);
1738 if (status < 0) {
1739 mlog_errno(status);
1740 goto bail;
1741 }
1742 }
1743
1617 status = ocfs2_add_entry(handle, dentry, inode, 1744 status = ocfs2_add_entry(handle, dentry, inode,
1618 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1745 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1619 de_bh); 1746 de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
1632 dentry->d_op = &ocfs2_dentry_ops; 1759 dentry->d_op = &ocfs2_dentry_ops;
1633 d_instantiate(dentry, inode); 1760 d_instantiate(dentry, inode);
1634bail: 1761bail:
1762 if (status < 0 && did_quota)
1763 vfs_dq_free_space_nodirty(inode,
1764 ocfs2_clusters_to_bytes(osb->sb, 1));
1765 if (status < 0 && did_quota_inode)
1766 vfs_dq_free_inode(inode);
1635 if (handle) 1767 if (handle)
1636 ocfs2_commit_trans(osb, handle); 1768 ocfs2_commit_trans(osb, handle);
1637 1769
@@ -1640,12 +1772,18 @@ bail:
1640 brelse(new_fe_bh); 1772 brelse(new_fe_bh);
1641 brelse(parent_fe_bh); 1773 brelse(parent_fe_bh);
1642 brelse(de_bh); 1774 brelse(de_bh);
1775 kfree(si.name);
1776 kfree(si.value);
1643 if (inode_ac) 1777 if (inode_ac)
1644 ocfs2_free_alloc_context(inode_ac); 1778 ocfs2_free_alloc_context(inode_ac);
1645 if (data_ac) 1779 if (data_ac)
1646 ocfs2_free_alloc_context(data_ac); 1780 ocfs2_free_alloc_context(data_ac);
1647 if ((status < 0) && inode) 1781 if (xattr_ac)
1782 ocfs2_free_alloc_context(xattr_ac);
1783 if ((status < 0) && inode) {
1784 clear_nlink(inode);
1648 iput(inode); 1785 iput(inode);
1786 }
1649 1787
1650 mlog_exit(status); 1788 mlog_exit(status);
1651 1789
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1754 1892
1755 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1893 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1756 1894
1757 status = ocfs2_read_block(orphan_dir_inode, 1895 status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
1758 OCFS2_I(orphan_dir_inode)->ip_blkno,
1759 &orphan_dir_bh);
1760 if (status < 0) { 1896 if (status < 0) {
1761 mlog_errno(status); 1897 mlog_errno(status);
1762 goto leave; 1898 goto leave;
1763 } 1899 }
1764 1900
1765 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, 1901 status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
1766 OCFS2_JOURNAL_ACCESS_WRITE); 1902 OCFS2_JOURNAL_ACCESS_WRITE);
1767 if (status < 0) { 1903 if (status < 0) {
1768 mlog_errno(status); 1904 mlog_errno(status);
1769 goto leave; 1905 goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1850 goto leave; 1986 goto leave;
1851 } 1987 }
1852 1988
1853 status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, 1989 status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh,
1854 OCFS2_JOURNAL_ACCESS_WRITE); 1990 OCFS2_JOURNAL_ACCESS_WRITE);
1855 if (status < 0) { 1991 if (status < 0) {
1856 mlog_errno(status); 1992 mlog_errno(status);
1857 goto leave; 1993 goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d8992..077384135f4e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
161{ 161{
162 VOLUME_INIT = 0, 162 VOLUME_INIT = 0,
163 VOLUME_MOUNTED, 163 VOLUME_MOUNTED,
164 VOLUME_MOUNTED_QUOTAS,
164 VOLUME_DISMOUNTED, 165 VOLUME_DISMOUNTED,
165 VOLUME_DISABLED 166 VOLUME_DISABLED
166}; 167};
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
195 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 196 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
196 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ 197 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
197 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ 198 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
199 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
200 OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
201 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
198}; 202};
199 203
200#define OCFS2_OSB_SOFT_RO 0x0001 204#define OCFS2_OSB_SOFT_RO 0x0001
@@ -205,6 +209,8 @@ enum ocfs2_mount_options
205struct ocfs2_journal; 209struct ocfs2_journal;
206struct ocfs2_slot_info; 210struct ocfs2_slot_info;
207struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_quota_recovery;
213struct ocfs2_dentry_lock;
208struct ocfs2_super 214struct ocfs2_super
209{ 215{
210 struct task_struct *commit_task; 216 struct task_struct *commit_task;
@@ -286,10 +292,11 @@ struct ocfs2_super
286 char *local_alloc_debug_buf; 292 char *local_alloc_debug_buf;
287#endif 293#endif
288 294
289 /* Next two fields are for local node slot recovery during 295 /* Next three fields are for local node slot recovery during
290 * mount. */ 296 * mount. */
291 int dirty; 297 int dirty;
292 struct ocfs2_dinode *local_alloc_copy; 298 struct ocfs2_dinode *local_alloc_copy;
299 struct ocfs2_quota_recovery *quota_rec;
293 300
294 struct ocfs2_alloc_stats alloc_stats; 301 struct ocfs2_alloc_stats alloc_stats;
295 char dev_str[20]; /* "major,minor" of the device */ 302 char dev_str[20]; /* "major,minor" of the device */
@@ -319,6 +326,11 @@ struct ocfs2_super
319 struct list_head blocked_lock_list; 326 struct list_head blocked_lock_list;
320 unsigned long blocked_lock_count; 327 unsigned long blocked_lock_count;
321 328
329 /* List of dentry locks to release. Anyone can add locks to
330 * the list, ocfs2_wq processes the list */
331 struct ocfs2_dentry_lock *dentry_lock_list;
332 struct work_struct dentry_lock_work;
333
322 wait_queue_head_t osb_mount_event; 334 wait_queue_head_t osb_mount_event;
323 335
324 /* Truncate log info */ 336 /* Truncate log info */
@@ -333,6 +345,10 @@ struct ocfs2_super
333 345
334#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 346#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
335 347
348/* Useful typedef for passing around journal access functions */
349typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
350 struct buffer_head *bh, int type);
351
336static inline int ocfs2_should_order_data(struct inode *inode) 352static inline int ocfs2_should_order_data(struct inode *inode)
337{ 353{
338 if (!S_ISREG(inode->i_mode)) 354 if (!S_ISREG(inode->i_mode))
@@ -376,6 +392,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
376 return 0; 392 return 0;
377} 393}
378 394
395static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
396{
397 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
398 return 1;
399 return 0;
400}
401
379/* set / clear functions because cluster events can make these happen 402/* set / clear functions because cluster events can make these happen
380 * in parallel so we want the transitions to be atomic. this also 403 * in parallel so we want the transitions to be atomic. this also
381 * means that any future flags osb_flags must be protected by spinlock 404 * means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +466,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
443#define OCFS2_IS_VALID_DINODE(ptr) \ 466#define OCFS2_IS_VALID_DINODE(ptr) \
444 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) 467 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
445 468
446#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
447 typeof(__di) ____di = (__di); \
448 ocfs2_error((__sb), \
449 "Dinode # %llu has bad signature %.*s", \
450 (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \
451 (____di)->i_signature); \
452} while (0)
453
454#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ 469#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
455 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) 470 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
456 471
457#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
458 typeof(__eb) ____eb = (__eb); \
459 ocfs2_error((__sb), \
460 "Extent Block # %llu has bad signature %.*s", \
461 (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \
462 (____eb)->h_signature); \
463} while (0)
464
465#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ 472#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
466 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) 473 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
467 474
468#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
469 typeof(__gd) ____gd = (__gd); \
470 ocfs2_error((__sb), \
471 "Group Descriptor # %llu has bad signature %.*s", \
472 (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
473 (____gd)->bg_signature); \
474} while (0)
475 475
476#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ 476#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \
477 (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) 477 (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
478 478
479#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
480 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
481
479static inline unsigned long ino_from_blkno(struct super_block *sb, 482static inline unsigned long ino_from_blkno(struct super_block *sb,
480 u64 blkno) 483 u64 blkno)
481{ 484{
@@ -632,5 +635,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
632#define ocfs2_clear_bit ext2_clear_bit 635#define ocfs2_clear_bit ext2_clear_bit
633#define ocfs2_test_bit ext2_test_bit 636#define ocfs2_test_bit ext2_test_bit
634#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 637#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
638#define ocfs2_find_next_bit ext2_find_next_bit
635#endif /* OCFS2_H */ 639#endif /* OCFS2_H */
636 640
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7d..c7ae45aaa36c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" 65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
68 69
69/* Compatibility flags */ 70/* Compatibility flags */
70#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -93,8 +94,11 @@
93 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ 94 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
94 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
95 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
96 | OCFS2_FEATURE_INCOMPAT_XATTR) 97 | OCFS2_FEATURE_INCOMPAT_XATTR \
97#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 98 | OCFS2_FEATURE_INCOMPAT_META_ECC)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
98 102
99/* 103/*
100 * Heartbeat-only devices are missing journals and other files. The 104 * Heartbeat-only devices are missing journals and other files. The
@@ -147,6 +151,9 @@
147/* Support for extended attributes */ 151/* Support for extended attributes */
148#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
149 153
154/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156
150/* 157/*
151 * backup superblock flag is used to indicate that this volume 158 * backup superblock flag is used to indicate that this volume
152 * has backup superblocks. 159 * has backup superblocks.
@@ -163,6 +170,12 @@
163 */ 170 */
164#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 171#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
165 172
173/*
174 * Maintain quota information for this filesystem
175 */
176#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
177#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
178
166/* The byte offset of the first backup block will be 1G. 179/* The byte offset of the first backup block will be 1G.
167 * The following will be 4G, 16G, 64G, 256G and 1T. 180 * The following will be 4G, 16G, 64G, 256G and 1T.
168 */ 181 */
@@ -192,6 +205,7 @@
192#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ 205#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
193#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 206#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
194#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 207#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
208#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
195 209
196/* 210/*
197 * Flags on ocfs2_dinode.i_dyn_features 211 * Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
329#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE 343#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
330 HEARTBEAT_SYSTEM_INODE, 344 HEARTBEAT_SYSTEM_INODE,
331 GLOBAL_BITMAP_SYSTEM_INODE, 345 GLOBAL_BITMAP_SYSTEM_INODE,
332#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE 346 USER_QUOTA_SYSTEM_INODE,
347 GROUP_QUOTA_SYSTEM_INODE,
348#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
333 ORPHAN_DIR_SYSTEM_INODE, 349 ORPHAN_DIR_SYSTEM_INODE,
334 EXTENT_ALLOC_SYSTEM_INODE, 350 EXTENT_ALLOC_SYSTEM_INODE,
335 INODE_ALLOC_SYSTEM_INODE, 351 INODE_ALLOC_SYSTEM_INODE,
336 JOURNAL_SYSTEM_INODE, 352 JOURNAL_SYSTEM_INODE,
337 LOCAL_ALLOC_SYSTEM_INODE, 353 LOCAL_ALLOC_SYSTEM_INODE,
338 TRUNCATE_LOG_SYSTEM_INODE, 354 TRUNCATE_LOG_SYSTEM_INODE,
355 LOCAL_USER_QUOTA_SYSTEM_INODE,
356 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
339 NUM_SYSTEM_INODES 357 NUM_SYSTEM_INODES
340}; 358};
341 359
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
349 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, 367 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
350 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, 368 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
351 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, 369 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
370 [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
371 [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
352 372
353 /* Slot-specific system inodes (one copy per slot) */ 373 /* Slot-specific system inodes (one copy per slot) */
354 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, 374 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
356 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, 376 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
357 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, 377 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
358 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, 378 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
359 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } 379 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
380 [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
381 [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
360}; 382};
361 383
362/* Parameter passed from mount.ocfs2 to module */ 384/* Parameter passed from mount.ocfs2 to module */
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
410#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) 432#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
411 433
412/* 434/*
435 * Block checking structure. This is used in metadata to validate the
436 * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
437 * zeros.
438 */
439struct ocfs2_block_check {
440/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
441 __le16 bc_ecc; /* Single-error-correction parity vector.
442 This is a simple Hamming code dependant
443 on the blocksize. OCFS2's maximum
444 blocksize, 4K, requires 16 parity bits,
445 so we fit in __le16. */
446 __le16 bc_reserved1;
447/*08*/
448};
449
450/*
413 * On disk extent record for OCFS2 451 * On disk extent record for OCFS2
414 * It describes a range of clusters on disk. 452 * It describes a range of clusters on disk.
415 * 453 *
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
496struct ocfs2_extent_block 534struct ocfs2_extent_block
497{ 535{
498/*00*/ __u8 h_signature[8]; /* Signature for verification */ 536/*00*/ __u8 h_signature[8]; /* Signature for verification */
499 __le64 h_reserved1; 537 struct ocfs2_block_check h_check; /* Error checking */
500/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this 538/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
501 extent_header belongs to */ 539 extent_header belongs to */
502 __le16 h_suballoc_bit; /* Bit offset in suballocator 540 __le16 h_suballoc_bit; /* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
666 was set in i_flags */ 704 was set in i_flags */
667 __le16 i_dyn_features; 705 __le16 i_dyn_features;
668 __le64 i_xattr_loc; 706 __le64 i_xattr_loc;
669/*80*/ __le64 i_reserved2[7]; 707/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6];
670/*B8*/ union { 709/*B8*/ union {
671 __le64 i_pad1; /* Generic way to refer to this 710 __le64 i_pad1; /* Generic way to refer to this
672 64bit union */ 711 64bit union */
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry {
715} __attribute__ ((packed)); 754} __attribute__ ((packed));
716 755
717/* 756/*
757 * Per-block record for the unindexed directory btree. This is carefully
758 * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
759 * mirrored. That way, the directory manipulation code needs a minimal amount
760 * of update.
761 *
762 * NOTE: Keep this structure aligned to a multiple of 4 bytes.
763 */
764struct ocfs2_dir_block_trailer {
765/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */
766
767 __le16 db_compat_rec_len; /* Backwards compatible with
768 * ocfs2_dir_entry. */
769 __u8 db_compat_name_len; /* Always zero. Was name_len */
770 __u8 db_reserved0;
771 __le16 db_reserved1;
772 __le16 db_free_rec_len; /* Size of largest empty hole
773 * in this block. (unused) */
774/*10*/ __u8 db_signature[8]; /* Signature for verification */
775 __le64 db_reserved2;
776 __le64 db_free_next; /* Next block in list (unused) */
777/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */
778 __le64 db_parent_dinode; /* dinode which owns me, in
779 blocks */
780/*30*/ struct ocfs2_block_check db_check; /* Error checking */
781/*40*/
782};
783
784/*
718 * On disk allocator group structure for OCFS2 785 * On disk allocator group structure for OCFS2
719 */ 786 */
720struct ocfs2_group_desc 787struct ocfs2_group_desc
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
733/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in 800/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
734 blocks */ 801 blocks */
735 __le64 bg_blkno; /* Offset on disk, in blocks */ 802 __le64 bg_blkno; /* Offset on disk, in blocks */
736/*30*/ __le64 bg_reserved2[2]; 803/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
804 __le64 bg_reserved2;
737/*40*/ __u8 bg_bitmap[0]; 805/*40*/ __u8 bg_bitmap[0];
738}; 806};
739 807
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
776 in this extent record, 844 in this extent record,
777 only valid in the first 845 only valid in the first
778 bucket. */ 846 bucket. */
779 __le64 xh_csum; 847 struct ocfs2_block_check xh_check; /* Error checking
848 (Note, this is only
849 used for xattr
850 buckets. A block uses
851 xb_check and sets
852 this field to zero.) */
780 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ 853 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
781}; 854};
782 855
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
827 block group */ 900 block group */
828 __le32 xb_fs_generation; /* Must match super block */ 901 __le32 xb_fs_generation; /* Must match super block */
829/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ 902/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
830 __le64 xb_csum; 903 struct ocfs2_block_check xb_check; /* Error checking */
831/*20*/ __le16 xb_flags; /* Indicates whether this block contains 904/*20*/ __le16 xb_flags; /* Indicates whether this block contains
832 real xattr or a xattr tree. */ 905 real xattr or a xattr tree. */
833 __le16 xb_reserved0; 906 __le16 xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
868 return xe->xe_type & OCFS2_XATTR_TYPE_MASK; 941 return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
869} 942}
870 943
944/*
945 * On disk structures for global quota file
946 */
947
948/* Magic numbers and known versions for global quota files */
949#define OCFS2_GLOBAL_QMAGICS {\
950 0x0cf52470, /* USRQUOTA */ \
951 0x0cf52471 /* GRPQUOTA */ \
952}
953
954#define OCFS2_GLOBAL_QVERSIONS {\
955 0, \
956 0, \
957}
958
959
960/* Each block of each quota file has a certain fixed number of bytes reserved
961 * for OCFS2 internal use at its end. OCFS2 can use it for things like
962 * checksums, etc. */
963#define OCFS2_QBLK_RESERVED_SPACE 8
964
965/* Generic header of all quota files */
966struct ocfs2_disk_dqheader {
967 __le32 dqh_magic; /* Magic number identifying file */
968 __le32 dqh_version; /* Quota format version */
969};
970
971#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
972
973/* Information header of global quota file (immediately follows the generic
974 * header) */
975struct ocfs2_global_disk_dqinfo {
976/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */
977 __le32 dqi_igrace; /* Grace time for inode softlimit excess */
978 __le32 dqi_syncms; /* Time after which we sync local changes to
979 * global quota file */
980 __le32 dqi_blocks; /* Number of blocks in quota file */
981/*10*/ __le32 dqi_free_blk; /* First free block in quota file */
982 __le32 dqi_free_entry; /* First block with free dquot entry in quota
983 * file */
984};
985
986/* Structure with global user / group information. We reserve some space
987 * for future use. */
988struct ocfs2_global_disk_dqblk {
989/*00*/ __le32 dqb_id; /* ID the structure belongs to */
990 __le32 dqb_use_count; /* Number of nodes having reference to this structure */
991 __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
992/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */
993 __le64 dqb_curinodes; /* current # allocated inodes */
994/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */
995 __le64 dqb_bsoftlimit; /* preferred limit on disk space */
996/*30*/ __le64 dqb_curspace; /* current space occupied */
997 __le64 dqb_btime; /* time limit for excessive disk use */
998/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */
999 __le64 dqb_pad1;
1000/*50*/ __le64 dqb_pad2;
1001};
1002
1003/*
1004 * On-disk structures for local quota file
1005 */
1006
1007/* Magic numbers and known versions for local quota files */
1008#define OCFS2_LOCAL_QMAGICS {\
1009 0x0cf524c0, /* USRQUOTA */ \
1010 0x0cf524c1 /* GRPQUOTA */ \
1011}
1012
1013#define OCFS2_LOCAL_QVERSIONS {\
1014 0, \
1015 0, \
1016}
1017
1018/* Quota flags in dqinfo header */
1019#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\
1020 * quota has been cleanly turned off) */
1021
1022#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
1023
1024/* Information header of local quota file (immediately follows the generic
1025 * header) */
1026struct ocfs2_local_disk_dqinfo {
1027 __le32 dqi_flags; /* Flags for quota file */
1028 __le32 dqi_chunks; /* Number of chunks of quota structures
1029 * with a bitmap */
1030 __le32 dqi_blocks; /* Number of blocks allocated for quota file */
1031};
1032
1033/* Header of one chunk of a quota file */
1034struct ocfs2_local_disk_chunk {
1035 __le32 dqc_free; /* Number of free entries in the bitmap */
1036 u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
1037 * chunk of quota file */
1038};
1039
1040/* One entry in local quota file */
1041struct ocfs2_local_disk_dqblk {
1042/*00*/ __le64 dqb_id; /* id this quota applies to */
1043 __le64 dqb_spacemod; /* Change in the amount of used space */
1044/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */
1045};
1046
1047
1048/*
1049 * The quota trailer lives at the end of each quota block.
1050 */
1051
1052struct ocfs2_disk_dqtrailer {
1053/*00*/ struct ocfs2_block_check dq_check; /* Error checking */
1054/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
1055};
1056
1057static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
1058 void *buf)
1059{
1060 char *ptr = buf;
1061 ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
1062
1063 return (struct ocfs2_disk_dqtrailer *)ptr;
1064}
1065
871#ifdef __KERNEL__ 1066#ifdef __KERNEL__
872static inline int ocfs2_fast_symlink_chars(struct super_block *sb) 1067static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
873{ 1068{
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f558..000000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_jbd_compat.h
5 *
6 * Compatibility defines for JBD.
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_JBD_COMPAT_H
21#define OCFS2_JBD_COMPAT_H
22
23#ifndef CONFIG_OCFS2_COMPAT_JBD
24# error Should not have been included
25#endif
26
27struct jbd2_inode {
28 unsigned int dummy;
29};
30
31#define JBD2_BARRIER JFS_BARRIER
32#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
33
34#define jbd2_journal_ack_err journal_ack_err
35#define jbd2_journal_clear_err journal_clear_err
36#define jbd2_journal_destroy journal_destroy
37#define jbd2_journal_dirty_metadata journal_dirty_metadata
38#define jbd2_journal_errno journal_errno
39#define jbd2_journal_extend journal_extend
40#define jbd2_journal_flush journal_flush
41#define jbd2_journal_force_commit journal_force_commit
42#define jbd2_journal_get_write_access journal_get_write_access
43#define jbd2_journal_get_undo_access journal_get_undo_access
44#define jbd2_journal_init_inode journal_init_inode
45#define jbd2_journal_invalidatepage journal_invalidatepage
46#define jbd2_journal_load journal_load
47#define jbd2_journal_lock_updates journal_lock_updates
48#define jbd2_journal_restart journal_restart
49#define jbd2_journal_start journal_start
50#define jbd2_journal_start_commit journal_start_commit
51#define jbd2_journal_stop journal_stop
52#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
53#define jbd2_journal_unlock_updates journal_unlock_updates
54#define jbd2_journal_wipe journal_wipe
55#define jbd2_log_wait_commit log_wait_commit
56
57static inline int jbd2_journal_file_inode(handle_t *handle,
58 struct jbd2_inode *inode)
59{
60 return 0;
61}
62
63static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
64 loff_t new_size)
65{
66 return 0;
67}
68
69static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
70 struct inode *inode)
71{
72 return;
73}
74
75static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
76 struct jbd2_inode *jinode)
77{
78 return;
79}
80
81
82#endif /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f1..eb6f50c9ceca 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO,
49 OCFS2_NUM_LOCK_TYPES 50 OCFS2_NUM_LOCK_TYPES
50}; 51};
51 52
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
77 case OCFS2_LOCK_TYPE_FLOCK: 78 case OCFS2_LOCK_TYPE_FLOCK:
78 c = 'F'; 79 c = 'F';
79 break; 80 break;
81 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q';
83 break;
80 default: 84 default:
81 c = '\0'; 85 c = '\0';
82 } 86 }
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
95 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 99 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
96 [OCFS2_LOCK_TYPE_OPEN] = "Open", 100 [OCFS2_LOCK_TYPE_OPEN] = "Open",
97 [OCFS2_LOCK_TYPE_FLOCK] = "Flock", 101 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
102 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
98}; 103};
99 104
100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 105static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 000000000000..7365e2e08706
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
1/*
2 * quota.h for OCFS2
3 *
4 * On disk quota structures for local and global quota file, in-memory
5 * structures.
6 *
7 */
8
9#ifndef _OCFS2_QUOTA_H
10#define _OCFS2_QUOTA_H
11
12#include <linux/types.h>
13#include <linux/slab.h>
14#include <linux/quota.h>
15#include <linux/list.h>
16#include <linux/dqblk_qtree.h>
17
18#include "ocfs2.h"
19
20/* Common stuff */
21/* id number of quota format */
22#define QFMT_OCFS2 3
23
24/*
25 * In-memory structures
26 */
27struct ocfs2_dquot {
28 struct dquot dq_dquot; /* Generic VFS dquot */
29 loff_t dq_local_off; /* Offset in the local quota file */
30 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
31 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
32 s64 dq_origspace; /* Last globally synced space usage */
33 s64 dq_originodes; /* Last globally synced inode usage */
34};
35
36/* Description of one chunk to recover in memory */
37struct ocfs2_recovery_chunk {
38 struct list_head rc_list; /* List of chunks */
39 int rc_chunk; /* Chunk number */
40 unsigned long *rc_bitmap; /* Bitmap of entries to recover */
41};
42
43struct ocfs2_quota_recovery {
44 struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */
45};
46
47/* In-memory structure with quota header information */
48struct ocfs2_mem_dqinfo {
49 unsigned int dqi_type; /* Quota type this structure describes */
50 unsigned int dqi_chunks; /* Number of chunks in local quota file */
51 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
52 unsigned int dqi_syncms; /* How often should we sync with other nodes */
53 unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
54 struct list_head dqi_chunk; /* List of chunks */
55 struct inode *dqi_gqinode; /* Global quota file inode */
56 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
57 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
58 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
59 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
60 struct buffer_head *dqi_ibh; /* Buffer with information header */
61 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
62 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
63 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
64 * information, in case we
65 * enable quotas on file
66 * needing it */
67};
68
69static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
70{
71 return container_of(dquot, struct ocfs2_dquot, dq_dquot);
72}
73
74struct ocfs2_quota_chunk {
75 struct list_head qc_chunk; /* List of quotafile chunks */
76 int qc_num; /* Number of quota chunk */
77 struct buffer_head *qc_headerbh; /* Buffer head with chunk header */
78};
79
80extern struct kmem_cache *ocfs2_dquot_cachep;
81extern struct kmem_cache *ocfs2_qf_chunk_cachep;
82
83extern struct qtree_fmt_operations ocfs2_global_ops;
84
85struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
86 struct ocfs2_super *osb, int slot_num);
87int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
88 struct ocfs2_quota_recovery *rec,
89 int slot_num);
90void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
91ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
92 size_t len, loff_t off);
93ssize_t ocfs2_quota_write(struct super_block *sb, int type,
94 const char *data, size_t len, loff_t off);
95int ocfs2_global_read_info(struct super_block *sb, int type);
96int ocfs2_global_write_info(struct super_block *sb, int type);
97int ocfs2_global_read_dquot(struct dquot *dquot);
98int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
/* Sync @dquot via __ocfs2_sync_dquot() with freeing == 0. */
static inline int ocfs2_sync_dquot(struct dquot *dquot)
{
	const int freeing = 0;

	return __ocfs2_sync_dquot(dquot, freeing);
}
/* Sync @dquot via __ocfs2_sync_dquot() with freeing == 1. */
static inline int ocfs2_global_release_dquot(struct dquot *dquot)
{
	const int freeing = 1;

	return __ocfs2_sync_dquot(dquot, freeing);
}
107
108int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
109void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
110int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
111 struct buffer_head **bh);
112
113extern struct dquot_operations ocfs2_quota_operations;
114extern struct quota_format_type ocfs2_quota_format;
115
116int ocfs2_quota_setup(void);
117void ocfs2_quota_shutdown(void);
118
119#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 000000000000..1ed0f7c86869
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,862 @@
1/*
2 * Implementation of operations over global quota file
3 */
4#include <linux/spinlock.h>
5#include <linux/fs.h>
6#include <linux/quota.h>
7#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h>
9#include <linux/jiffies.h>
10#include <linux/writeback.h>
11#include <linux/workqueue.h>
12
13#define MLOG_MASK_PREFIX ML_QUOTA
14#include <cluster/masklog.h>
15
16#include "ocfs2_fs.h"
17#include "ocfs2.h"
18#include "alloc.h"
19#include "blockcheck.h"
20#include "inode.h"
21#include "journal.h"
22#include "file.h"
23#include "sysfile.h"
24#include "dlmglue.h"
25#include "uptodate.h"
26#include "quota.h"
27
28static struct workqueue_struct *ocfs2_quota_wq = NULL;
29
30static void qsync_work_fn(struct work_struct *work);
31
32static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
33{
34 struct ocfs2_global_disk_dqblk *d = dp;
35 struct mem_dqblk *m = &dquot->dq_dqb;
36
37 /* Update from disk only entries not set by the admin */
38 if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
39 m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
40 m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
41 }
42 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
43 m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
44 if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
45 m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
46 m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
47 }
48 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
49 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
50 if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
51 m->dqb_btime = le64_to_cpu(d->dqb_btime);
52 if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
53 m->dqb_itime = le64_to_cpu(d->dqb_itime);
54 OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
55}
56
57static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
58{
59 struct ocfs2_global_disk_dqblk *d = dp;
60 struct mem_dqblk *m = &dquot->dq_dqb;
61
62 d->dqb_id = cpu_to_le32(dquot->dq_id);
63 d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
64 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
65 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
66 d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
67 d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
68 d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
69 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
70 d->dqb_btime = cpu_to_le64(m->dqb_btime);
71 d->dqb_itime = cpu_to_le64(m->dqb_itime);
72}
73
74static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
75{
76 struct ocfs2_global_disk_dqblk *d = dp;
77 struct ocfs2_mem_dqinfo *oinfo =
78 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
79
80 if (qtree_entry_unused(&oinfo->dqi_gi, dp))
81 return 0;
82 return le32_to_cpu(d->dqb_id) == dquot->dq_id;
83}
84
85struct qtree_fmt_operations ocfs2_global_ops = {
86 .mem2disk_dqblk = ocfs2_global_mem2diskdqb,
87 .disk2mem_dqblk = ocfs2_global_disk2memdqb,
88 .is_id = ocfs2_global_is_id,
89};
90
91static int ocfs2_validate_quota_block(struct super_block *sb,
92 struct buffer_head *bh)
93{
94 struct ocfs2_disk_dqtrailer *dqt =
95 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
96
97 mlog(0, "Validating quota block %llu\n",
98 (unsigned long long)bh->b_blocknr);
99
100 BUG_ON(!buffer_uptodate(bh));
101
102 /*
103 * If the ecc fails, we return the error but otherwise
104 * leave the filesystem running. We know any error is
105 * local to this block.
106 */
107 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
108}
109
110int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
111 struct buffer_head **bh)
112{
113 int rc = 0;
114 struct buffer_head *tmp = *bh;
115
116 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
117 ocfs2_validate_quota_block);
118 if (rc)
119 mlog_errno(rc);
120
121 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
122 if (!rc && !*bh)
123 *bh = tmp;
124
125 return rc;
126}
127
128static int ocfs2_get_quota_block(struct inode *inode, int block,
129 struct buffer_head **bh)
130{
131 u64 pblock, pcount;
132 int err;
133
134 down_read(&OCFS2_I(inode)->ip_alloc_sem);
135 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
136 up_read(&OCFS2_I(inode)->ip_alloc_sem);
137 if (err) {
138 mlog_errno(err);
139 return err;
140 }
141 *bh = sb_getblk(inode->i_sb, pblock);
142 if (!*bh) {
143 err = -EIO;
144 mlog_errno(err);
145 }
146 return err;;
147}
148
149/* Read data from global quotafile - avoid pagecache and such because we cannot
150 * afford acquiring the locks... We use quota cluster lock to serialize
151 * operations. Caller is responsible for acquiring it. */
152ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
153 size_t len, loff_t off)
154{
155 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
156 struct inode *gqinode = oinfo->dqi_gqinode;
157 loff_t i_size = i_size_read(gqinode);
158 int offset = off & (sb->s_blocksize - 1);
159 sector_t blk = off >> sb->s_blocksize_bits;
160 int err = 0;
161 struct buffer_head *bh;
162 size_t toread, tocopy;
163
164 if (off > i_size)
165 return 0;
166 if (off + len > i_size)
167 len = i_size - off;
168 toread = len;
169 while (toread > 0) {
170 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
171 bh = NULL;
172 err = ocfs2_read_quota_block(gqinode, blk, &bh);
173 if (err) {
174 mlog_errno(err);
175 return err;
176 }
177 memcpy(data, bh->b_data + offset, tocopy);
178 brelse(bh);
179 offset = 0;
180 toread -= tocopy;
181 data += tocopy;
182 blk++;
183 }
184 return len;
185}
186
187/* Write to quotafile (we know the transaction is already started and has
188 * enough credits) */
189ssize_t ocfs2_quota_write(struct super_block *sb, int type,
190 const char *data, size_t len, loff_t off)
191{
192 struct mem_dqinfo *info = sb_dqinfo(sb, type);
193 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
194 struct inode *gqinode = oinfo->dqi_gqinode;
195 int offset = off & (sb->s_blocksize - 1);
196 sector_t blk = off >> sb->s_blocksize_bits;
197 int err = 0, new = 0, ja_type;
198 struct buffer_head *bh = NULL;
199 handle_t *handle = journal_current_handle();
200
201 if (!handle) {
202 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
203 "because transaction was not started.\n",
204 (unsigned long long)off, (unsigned long long)len);
205 return -EIO;
206 }
207 if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
208 WARN_ON(1);
209 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
210 }
211
212 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
213 if (gqinode->i_size < off + len) {
214 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
215 err = ocfs2_extend_no_holes(gqinode, off + len, off);
216 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
217 if (err < 0)
218 goto out;
219 err = ocfs2_simple_size_update(gqinode,
220 oinfo->dqi_gqi_bh,
221 off + len);
222 if (err < 0)
223 goto out;
224 new = 1;
225 }
226 /* Not rewriting whole block? */
227 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
228 !new) {
229 err = ocfs2_read_quota_block(gqinode, blk, &bh);
230 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
231 } else {
232 err = ocfs2_get_quota_block(gqinode, blk, &bh);
233 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
234 }
235 if (err) {
236 mlog_errno(err);
237 return err;
238 }
239 lock_buffer(bh);
240 if (new)
241 memset(bh->b_data, 0, sb->s_blocksize);
242 memcpy(bh->b_data + offset, data, len);
243 flush_dcache_page(bh->b_page);
244 set_buffer_uptodate(bh);
245 unlock_buffer(bh);
246 ocfs2_set_buffer_uptodate(gqinode, bh);
247 err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
248 if (err < 0) {
249 brelse(bh);
250 goto out;
251 }
252 err = ocfs2_journal_dirty(handle, bh);
253 brelse(bh);
254 if (err < 0)
255 goto out;
256out:
257 if (err) {
258 mutex_unlock(&gqinode->i_mutex);
259 mlog_errno(err);
260 return err;
261 }
262 gqinode->i_version++;
263 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
264 mutex_unlock(&gqinode->i_mutex);
265 return len;
266}
267
268int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
269{
270 int status;
271 struct buffer_head *bh = NULL;
272
273 status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
274 if (status < 0)
275 return status;
276 spin_lock(&dq_data_lock);
277 if (!oinfo->dqi_gqi_count++)
278 oinfo->dqi_gqi_bh = bh;
279 else
280 WARN_ON(bh != oinfo->dqi_gqi_bh);
281 spin_unlock(&dq_data_lock);
282 return 0;
283}
284
285void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
286{
287 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
288 brelse(oinfo->dqi_gqi_bh);
289 spin_lock(&dq_data_lock);
290 if (!--oinfo->dqi_gqi_count)
291 oinfo->dqi_gqi_bh = NULL;
292 spin_unlock(&dq_data_lock);
293}
294
295/* Read information header from global quota file */
296int ocfs2_global_read_info(struct super_block *sb, int type)
297{
298 struct inode *gqinode = NULL;
299 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
300 GROUP_QUOTA_SYSTEM_INODE };
301 struct ocfs2_global_disk_dqinfo dinfo;
302 struct mem_dqinfo *info = sb_dqinfo(sb, type);
303 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
304 int status;
305
306 mlog_entry_void();
307
308 /* Read global header */
309 gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
310 OCFS2_INVALID_SLOT);
311 if (!gqinode) {
312 mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
313 type);
314 status = -EINVAL;
315 goto out_err;
316 }
317 oinfo->dqi_gi.dqi_sb = sb;
318 oinfo->dqi_gi.dqi_type = type;
319 ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
320 oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
321 oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
322 oinfo->dqi_gqi_bh = NULL;
323 oinfo->dqi_gqi_count = 0;
324 oinfo->dqi_gqinode = gqinode;
325 status = ocfs2_lock_global_qf(oinfo, 0);
326 if (status < 0) {
327 mlog_errno(status);
328 goto out_err;
329 }
330 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
331 sizeof(struct ocfs2_global_disk_dqinfo),
332 OCFS2_GLOBAL_INFO_OFF);
333 ocfs2_unlock_global_qf(oinfo, 0);
334 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
335 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
336 status);
337 if (status >= 0)
338 status = -EIO;
339 mlog_errno(status);
340 goto out_err;
341 }
342 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
343 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
344 oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
345 oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
346 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
347 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
348 oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
349 oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
350 oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
351 OCFS2_QBLK_RESERVED_SPACE;
352 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
353 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
354 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
355 oinfo->dqi_syncjiff);
356
357out_err:
358 mlog_exit(status);
359 return status;
360}
361
362/* Write information to global quota file. Expects exlusive lock on quota
363 * file inode and quota info */
364static int __ocfs2_global_write_info(struct super_block *sb, int type)
365{
366 struct mem_dqinfo *info = sb_dqinfo(sb, type);
367 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
368 struct ocfs2_global_disk_dqinfo dinfo;
369 ssize_t size;
370
371 spin_lock(&dq_data_lock);
372 info->dqi_flags &= ~DQF_INFO_DIRTY;
373 dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
374 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
375 spin_unlock(&dq_data_lock);
376 dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
377 dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
378 dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
379 dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
380 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
381 sizeof(struct ocfs2_global_disk_dqinfo),
382 OCFS2_GLOBAL_INFO_OFF);
383 if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
384 mlog(ML_ERROR, "Cannot write global quota info structure\n");
385 if (size >= 0)
386 size = -EIO;
387 return size;
388 }
389 return 0;
390}
391
392int ocfs2_global_write_info(struct super_block *sb, int type)
393{
394 int err;
395 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
396
397 err = ocfs2_qinfo_lock(info, 1);
398 if (err < 0)
399 return err;
400 err = __ocfs2_global_write_info(sb, type);
401 ocfs2_qinfo_unlock(info, 1);
402 return err;
403}
404
405/* Read in information from global quota file and acquire a reference to it.
406 * dquot_acquire() has already started the transaction and locked quota file */
407int ocfs2_global_read_dquot(struct dquot *dquot)
408{
409 int err, err2, ex = 0;
410 struct ocfs2_mem_dqinfo *info =
411 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
412
413 err = ocfs2_qinfo_lock(info, 0);
414 if (err < 0)
415 goto out;
416 err = qtree_read_dquot(&info->dqi_gi, dquot);
417 if (err < 0)
418 goto out_qlock;
419 OCFS2_DQUOT(dquot)->dq_use_count++;
420 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
421 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
422 if (!dquot->dq_off) { /* No real quota entry? */
423 /* Upgrade to exclusive lock for allocation */
424 err = ocfs2_qinfo_lock(info, 1);
425 if (err < 0)
426 goto out_qlock;
427 ex = 1;
428 }
429 err = qtree_write_dquot(&info->dqi_gi, dquot);
430 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
431 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
432 if (!err)
433 err = err2;
434 }
435out_qlock:
436 if (ex)
437 ocfs2_qinfo_unlock(info, 1);
438 ocfs2_qinfo_unlock(info, 0);
439out:
440 if (err < 0)
441 mlog_errno(err);
442 return err;
443}
444
445/* Sync local information about quota modifications with global quota file.
446 * Caller must have started the transaction and obtained exclusive lock for
447 * global quota file inode */
448int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
449{
450 int err, err2;
451 struct super_block *sb = dquot->dq_sb;
452 int type = dquot->dq_type;
453 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
454 struct ocfs2_global_disk_dqblk dqblk;
455 s64 spacechange, inodechange;
456 time_t olditime, oldbtime;
457
458 err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
459 sizeof(struct ocfs2_global_disk_dqblk),
460 dquot->dq_off);
461 if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
462 if (err >= 0) {
463 mlog(ML_ERROR, "Short read from global quota file "
464 "(%u read)\n", err);
465 err = -EIO;
466 }
467 goto out;
468 }
469
470 /* Update space and inode usage. Get also other information from
471 * global quota file so that we don't overwrite any changes there.
472 * We are */
473 spin_lock(&dq_data_lock);
474 spacechange = dquot->dq_dqb.dqb_curspace -
475 OCFS2_DQUOT(dquot)->dq_origspace;
476 inodechange = dquot->dq_dqb.dqb_curinodes -
477 OCFS2_DQUOT(dquot)->dq_originodes;
478 olditime = dquot->dq_dqb.dqb_itime;
479 oldbtime = dquot->dq_dqb.dqb_btime;
480 ocfs2_global_disk2memdqb(dquot, &dqblk);
481 mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
482 dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
483 dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
484 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
485 dquot->dq_dqb.dqb_curspace += spacechange;
486 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
487 dquot->dq_dqb.dqb_curinodes += inodechange;
488 /* Set properly space grace time... */
489 if (dquot->dq_dqb.dqb_bsoftlimit &&
490 dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
491 if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
492 oldbtime > 0) {
493 if (dquot->dq_dqb.dqb_btime > 0)
494 dquot->dq_dqb.dqb_btime =
495 min(dquot->dq_dqb.dqb_btime, oldbtime);
496 else
497 dquot->dq_dqb.dqb_btime = oldbtime;
498 }
499 } else {
500 dquot->dq_dqb.dqb_btime = 0;
501 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
502 }
503 /* Set properly inode grace time... */
504 if (dquot->dq_dqb.dqb_isoftlimit &&
505 dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
506 if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
507 olditime > 0) {
508 if (dquot->dq_dqb.dqb_itime > 0)
509 dquot->dq_dqb.dqb_itime =
510 min(dquot->dq_dqb.dqb_itime, olditime);
511 else
512 dquot->dq_dqb.dqb_itime = olditime;
513 }
514 } else {
515 dquot->dq_dqb.dqb_itime = 0;
516 clear_bit(DQ_INODES_B, &dquot->dq_flags);
517 }
518 /* All information is properly updated, clear the flags */
519 __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
520 __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
521 __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
522 __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
523 __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
524 __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
525 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
526 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
527 spin_unlock(&dq_data_lock);
528 err = ocfs2_qinfo_lock(info, freeing);
529 if (err < 0) {
530 mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
531 " (type=%d, id=%u)\n", dquot->dq_type,
532 (unsigned)dquot->dq_id);
533 goto out;
534 }
535 if (freeing)
536 OCFS2_DQUOT(dquot)->dq_use_count--;
537 err = qtree_write_dquot(&info->dqi_gi, dquot);
538 if (err < 0)
539 goto out_qlock;
540 if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
541 err = qtree_release_dquot(&info->dqi_gi, dquot);
542 if (info_dirty(sb_dqinfo(sb, type))) {
543 err2 = __ocfs2_global_write_info(sb, type);
544 if (!err)
545 err = err2;
546 }
547 }
548out_qlock:
549 ocfs2_qinfo_unlock(info, freeing);
550out:
551 if (err < 0)
552 mlog_errno(err);
553 return err;
554}
555
556/*
557 * Functions for periodic syncing of dquots with global file
558 */
559static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
560{
561 handle_t *handle;
562 struct super_block *sb = dquot->dq_sb;
563 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
564 struct ocfs2_super *osb = OCFS2_SB(sb);
565 int status = 0;
566
567 mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
568 dquot->dq_type, type, sb->s_id);
569 if (type != dquot->dq_type)
570 goto out;
571 status = ocfs2_lock_global_qf(oinfo, 1);
572 if (status < 0)
573 goto out;
574
575 handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
576 if (IS_ERR(handle)) {
577 status = PTR_ERR(handle);
578 mlog_errno(status);
579 goto out_ilock;
580 }
581 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
582 status = ocfs2_sync_dquot(dquot);
583 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
584 if (status < 0)
585 mlog_errno(status);
586 /* We have to write local structure as well... */
587 dquot_mark_dquot_dirty(dquot);
588 status = dquot_commit(dquot);
589 if (status < 0)
590 mlog_errno(status);
591 ocfs2_commit_trans(osb, handle);
592out_ilock:
593 ocfs2_unlock_global_qf(oinfo, 1);
594out:
595 mlog_exit(status);
596 return status;
597}
598
599static void qsync_work_fn(struct work_struct *work)
600{
601 struct ocfs2_mem_dqinfo *oinfo = container_of(work,
602 struct ocfs2_mem_dqinfo,
603 dqi_sync_work.work);
604 struct super_block *sb = oinfo->dqi_gqinode->i_sb;
605
606 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
607 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
608 oinfo->dqi_syncjiff);
609}
610
611/*
612 * Wrappers for generic quota functions
613 */
614
615static int ocfs2_write_dquot(struct dquot *dquot)
616{
617 handle_t *handle;
618 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
619 int status = 0;
620
621 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
622
623 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
624 if (IS_ERR(handle)) {
625 status = PTR_ERR(handle);
626 mlog_errno(status);
627 goto out;
628 }
629 status = dquot_commit(dquot);
630 ocfs2_commit_trans(osb, handle);
631out:
632 mlog_exit(status);
633 return status;
634}
635
636int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
637{
638 struct ocfs2_mem_dqinfo *oinfo;
639 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
640 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
641
642 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
643 return 0;
644
645 oinfo = sb_dqinfo(sb, type)->dqi_priv;
646 /* We modify tree, leaf block, global info, local chunk header,
647 * global and local inode */
648 return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
649 2 * OCFS2_INODE_UPDATE_CREDITS;
650}
651
652static int ocfs2_release_dquot(struct dquot *dquot)
653{
654 handle_t *handle;
655 struct ocfs2_mem_dqinfo *oinfo =
656 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
657 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
658 int status = 0;
659
660 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
661
662 status = ocfs2_lock_global_qf(oinfo, 1);
663 if (status < 0)
664 goto out;
665 handle = ocfs2_start_trans(osb,
666 ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
667 if (IS_ERR(handle)) {
668 status = PTR_ERR(handle);
669 mlog_errno(status);
670 goto out_ilock;
671 }
672 status = dquot_release(dquot);
673 ocfs2_commit_trans(osb, handle);
674out_ilock:
675 ocfs2_unlock_global_qf(oinfo, 1);
676out:
677 mlog_exit(status);
678 return status;
679}
680
681int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
682{
683 struct ocfs2_mem_dqinfo *oinfo;
684 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
686 struct ocfs2_dinode *lfe, *gfe;
687
688 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
689 return 0;
690
691 oinfo = sb_dqinfo(sb, type)->dqi_priv;
692 gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
693 lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
694 /* We can extend local file + global file. In local file we
695 * can modify info, chunk header block and dquot block. In
696 * global file we can modify info, tree and leaf block */
697 return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
698 ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
699 3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
700}
701
702static int ocfs2_acquire_dquot(struct dquot *dquot)
703{
704 handle_t *handle;
705 struct ocfs2_mem_dqinfo *oinfo =
706 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
707 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
708 int status = 0;
709
710 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
711 /* We need an exclusive lock, because we're going to update use count
712 * and instantiate possibly new dquot structure */
713 status = ocfs2_lock_global_qf(oinfo, 1);
714 if (status < 0)
715 goto out;
716 handle = ocfs2_start_trans(osb,
717 ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
718 if (IS_ERR(handle)) {
719 status = PTR_ERR(handle);
720 mlog_errno(status);
721 goto out_ilock;
722 }
723 status = dquot_acquire(dquot);
724 ocfs2_commit_trans(osb, handle);
725out_ilock:
726 ocfs2_unlock_global_qf(oinfo, 1);
727out:
728 mlog_exit(status);
729 return status;
730}
731
732static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
733{
734 unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
735 (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
736 (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
737 (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
738 (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
739 (1 << (DQ_LASTSET_B + QIF_ITIME_B));
740 int sync = 0;
741 int status;
742 struct super_block *sb = dquot->dq_sb;
743 int type = dquot->dq_type;
744 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
745 handle_t *handle;
746 struct ocfs2_super *osb = OCFS2_SB(sb);
747
748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
749 dquot_mark_dquot_dirty(dquot);
750
751 /* In case user set some limits, sync dquot immediately to global
752 * quota file so that information propagates quicker */
753 spin_lock(&dq_data_lock);
754 if (dquot->dq_flags & mask)
755 sync = 1;
756 spin_unlock(&dq_data_lock);
757 /* This is a slight hack but we can't afford getting global quota
758 * lock if we already have a transaction started. */
759 if (!sync || journal_current_handle()) {
760 status = ocfs2_write_dquot(dquot);
761 goto out;
762 }
763 status = ocfs2_lock_global_qf(oinfo, 1);
764 if (status < 0)
765 goto out;
766 handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
767 if (IS_ERR(handle)) {
768 status = PTR_ERR(handle);
769 mlog_errno(status);
770 goto out_ilock;
771 }
772 status = ocfs2_sync_dquot(dquot);
773 if (status < 0) {
774 mlog_errno(status);
775 goto out_trans;
776 }
777 /* Now write updated local dquot structure */
778 status = dquot_commit(dquot);
779out_trans:
780 ocfs2_commit_trans(osb, handle);
781out_ilock:
782 ocfs2_unlock_global_qf(oinfo, 1);
783out:
784 mlog_exit(status);
785 return status;
786}
787
788/* This should happen only after set_dqinfo(). */
789static int ocfs2_write_info(struct super_block *sb, int type)
790{
791 handle_t *handle;
792 int status = 0;
793 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
794
795 mlog_entry_void();
796
797 status = ocfs2_lock_global_qf(oinfo, 1);
798 if (status < 0)
799 goto out;
800 handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
801 if (IS_ERR(handle)) {
802 status = PTR_ERR(handle);
803 mlog_errno(status);
804 goto out_ilock;
805 }
806 status = dquot_commit_info(sb, type);
807 ocfs2_commit_trans(OCFS2_SB(sb), handle);
808out_ilock:
809 ocfs2_unlock_global_qf(oinfo, 1);
810out:
811 mlog_exit(status);
812 return status;
813}
814
815static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
816{
817 struct ocfs2_dquot *dquot =
818 kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
819
820 if (!dquot)
821 return NULL;
822 return &dquot->dq_dquot;
823}
824
825static void ocfs2_destroy_dquot(struct dquot *dquot)
826{
827 kmem_cache_free(ocfs2_dquot_cachep, dquot);
828}
829
830struct dquot_operations ocfs2_quota_operations = {
831 .initialize = dquot_initialize,
832 .drop = dquot_drop,
833 .alloc_space = dquot_alloc_space,
834 .alloc_inode = dquot_alloc_inode,
835 .free_space = dquot_free_space,
836 .free_inode = dquot_free_inode,
837 .transfer = dquot_transfer,
838 .write_dquot = ocfs2_write_dquot,
839 .acquire_dquot = ocfs2_acquire_dquot,
840 .release_dquot = ocfs2_release_dquot,
841 .mark_dirty = ocfs2_mark_dquot_dirty,
842 .write_info = ocfs2_write_info,
843 .alloc_dquot = ocfs2_alloc_dquot,
844 .destroy_dquot = ocfs2_destroy_dquot,
845};
846
847int ocfs2_quota_setup(void)
848{
849 ocfs2_quota_wq = create_workqueue("o2quot");
850 if (!ocfs2_quota_wq)
851 return -ENOMEM;
852 return 0;
853}
854
855void ocfs2_quota_shutdown(void)
856{
857 if (ocfs2_quota_wq) {
858 flush_workqueue(ocfs2_quota_wq);
859 destroy_workqueue(ocfs2_quota_wq);
860 ocfs2_quota_wq = NULL;
861 }
862}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 000000000000..07deec5e9721
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1253 @@
1/*
2 * Implementation of operations over local quota file
3 */
4
5#include <linux/fs.h>
6#include <linux/quota.h>
7#include <linux/quotaops.h>
8#include <linux/module.h>
9
10#define MLOG_MASK_PREFIX ML_QUOTA
11#include <cluster/masklog.h>
12
13#include "ocfs2_fs.h"
14#include "ocfs2.h"
15#include "inode.h"
16#include "alloc.h"
17#include "file.h"
18#include "buffer_head_io.h"
19#include "journal.h"
20#include "sysfile.h"
21#include "dlmglue.h"
22#include "quota.h"
23
24/* Number of local quota structures per block */
25static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
26{
27 return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
28 sizeof(struct ocfs2_local_disk_dqblk));
29}
30
31/* Number of blocks with entries in one chunk */
32static inline unsigned int ol_chunk_blocks(struct super_block *sb)
33{
34 return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
35 OCFS2_QBLK_RESERVED_SPACE) << 3) /
36 ol_quota_entries_per_block(sb);
37}
38
/* Number of entries tracked by the bitmap of one (full) chunk */
static unsigned int ol_chunk_entries(struct super_block *sb)
{
	unsigned int blocks = ol_chunk_blocks(sb);
	unsigned int per_block = ol_quota_entries_per_block(sb);

	return blocks * per_block;
}
44
/* Block number (within the quota file) of the header of chunk @c */
static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
{
	/* Every chunk occupies its data blocks plus one block of chunk info */
	unsigned int blocks_per_chunk = ol_chunk_blocks(sb) + 1;

	/* Block 0 holds the local quota file info */
	return 1 + blocks_per_chunk * c;
}
51
/* Block number (within the quota file) holding entry @off of chunk @c */
static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
{
	int epb = ol_quota_entries_per_block(sb);
	int block_in_chunk = off / epb;

	/* Skip the chunk header block, then index into the data blocks */
	return ol_quota_chunk_block(sb, c) + 1 + block_in_chunk;
}
58
59static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
60{
61 int epb = ol_quota_entries_per_block(sb);
62
63 return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
64}
65
66/* Offset of the dquot structure in the quota file */
67static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
68{
69 return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
70 ol_dqblk_block_off(sb, c, off);
71}
72
73/* Compute block number from given offset */
74static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
75{
76 return off >> sb->s_blocksize_bits;
77}
78
79static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
80{
81 return off & ((1 << sb->s_blocksize_bits) - 1);
82}
83
84/* Compute offset in the chunk of a structure with the given offset */
85static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
86{
87 int epb = ol_quota_entries_per_block(sb);
88
89 return ((off >> sb->s_blocksize_bits) -
90 ol_quota_chunk_block(sb, c) - 1) * epb
91 + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
92 sizeof(struct ocfs2_local_disk_dqblk);
93}
94
95/* Write bufferhead into the fs */
96static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
97 void (*modify)(struct buffer_head *, void *), void *private)
98{
99 struct super_block *sb = inode->i_sb;
100 handle_t *handle;
101 int status;
102
103 handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
104 if (IS_ERR(handle)) {
105 status = PTR_ERR(handle);
106 mlog_errno(status);
107 return status;
108 }
109 status = ocfs2_journal_access_dq(handle, inode, bh,
110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (status < 0) {
112 mlog_errno(status);
113 ocfs2_commit_trans(OCFS2_SB(sb), handle);
114 return status;
115 }
116 lock_buffer(bh);
117 modify(bh, private);
118 unlock_buffer(bh);
119 status = ocfs2_journal_dirty(handle, bh);
120 if (status < 0) {
121 mlog_errno(status);
122 ocfs2_commit_trans(OCFS2_SB(sb), handle);
123 return status;
124 }
125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
126 if (status < 0) {
127 mlog_errno(status);
128 return status;
129 }
130 return 0;
131}
132
133/* Check whether we understand format of quota files */
134static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
135{
136 unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
137 unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
138 unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
139 unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
140 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
141 GROUP_QUOTA_SYSTEM_INODE };
142 struct buffer_head *bh = NULL;
143 struct inode *linode = sb_dqopt(sb)->files[type];
144 struct inode *ginode = NULL;
145 struct ocfs2_disk_dqheader *dqhead;
146 int status, ret = 0;
147
148 /* First check whether we understand local quota file */
149 status = ocfs2_read_quota_block(linode, 0, &bh);
150 if (status) {
151 mlog_errno(status);
152 mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
153 type);
154 goto out_err;
155 }
156 dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
157 if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
158 mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
159 " type=%d\n", le32_to_cpu(dqhead->dqh_magic),
160 lmagics[type], type);
161 goto out_err;
162 }
163 if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
164 mlog(ML_ERROR, "quota file version does not match (%u != %u),"
165 " type=%d\n", le32_to_cpu(dqhead->dqh_version),
166 lversions[type], type);
167 goto out_err;
168 }
169 brelse(bh);
170 bh = NULL;
171
172 /* Next check whether we understand global quota file */
173 ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
174 OCFS2_INVALID_SLOT);
175 if (!ginode) {
176 mlog(ML_ERROR, "cannot get global quota file inode "
177 "(type=%d)\n", type);
178 goto out_err;
179 }
180 /* Since the header is read only, we don't care about locking */
181 status = ocfs2_read_quota_block(ginode, 0, &bh);
182 if (status) {
183 mlog_errno(status);
184 mlog(ML_ERROR, "failed to read global quota file header "
185 "(type=%d)\n", type);
186 goto out_err;
187 }
188 dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
189 if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
190 mlog(ML_ERROR, "global quota file magic does not match "
191 "(%u != %u), type=%d\n",
192 le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
193 goto out_err;
194 }
195 if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
196 mlog(ML_ERROR, "global quota file version does not match "
197 "(%u != %u), type=%d\n",
198 le32_to_cpu(dqhead->dqh_version), gversions[type],
199 type);
200 goto out_err;
201 }
202
203 ret = 1;
204out_err:
205 brelse(bh);
206 iput(ginode);
207 return ret;
208}
209
210/* Release given list of quota file chunks */
211static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
212{
213 struct ocfs2_quota_chunk *pos, *next;
214
215 list_for_each_entry_safe(pos, next, head, qc_chunk) {
216 list_del(&pos->qc_chunk);
217 brelse(pos->qc_headerbh);
218 kmem_cache_free(ocfs2_qf_chunk_cachep, pos);
219 }
220}
221
222/* Load quota bitmaps into memory */
223static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
224 struct ocfs2_local_disk_dqinfo *ldinfo,
225 struct list_head *head)
226{
227 struct ocfs2_quota_chunk *newchunk;
228 int i, status;
229
230 INIT_LIST_HEAD(head);
231 for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
232 newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
233 if (!newchunk) {
234 ocfs2_release_local_quota_bitmaps(head);
235 return -ENOMEM;
236 }
237 newchunk->qc_num = i;
238 newchunk->qc_headerbh = NULL;
239 status = ocfs2_read_quota_block(inode,
240 ol_quota_chunk_block(inode->i_sb, i),
241 &newchunk->qc_headerbh);
242 if (status) {
243 mlog_errno(status);
244 kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
245 ocfs2_release_local_quota_bitmaps(head);
246 return status;
247 }
248 list_add_tail(&newchunk->qc_chunk, head);
249 }
250 return 0;
251}
252
253static void olq_update_info(struct buffer_head *bh, void *private)
254{
255 struct mem_dqinfo *info = private;
256 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
257 struct ocfs2_local_disk_dqinfo *ldinfo;
258
259 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
260 OCFS2_LOCAL_INFO_OFF);
261 spin_lock(&dq_data_lock);
262 ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
263 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
264 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
265 spin_unlock(&dq_data_lock);
266}
267
268static int ocfs2_add_recovery_chunk(struct super_block *sb,
269 struct ocfs2_local_disk_chunk *dchunk,
270 int chunk,
271 struct list_head *head)
272{
273 struct ocfs2_recovery_chunk *rc;
274
275 rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
276 if (!rc)
277 return -ENOMEM;
278 rc->rc_chunk = chunk;
279 rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
280 if (!rc->rc_bitmap) {
281 kfree(rc);
282 return -ENOMEM;
283 }
284 memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
285 (ol_chunk_entries(sb) + 7) >> 3);
286 list_add_tail(&rc->rc_list, head);
287 return 0;
288}
289
290static void free_recovery_list(struct list_head *head)
291{
292 struct ocfs2_recovery_chunk *next;
293 struct ocfs2_recovery_chunk *rchunk;
294
295 list_for_each_entry_safe(rchunk, next, head, rc_list) {
296 list_del(&rchunk->rc_list);
297 kfree(rchunk->rc_bitmap);
298 kfree(rchunk);
299 }
300}
301
302void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
303{
304 int type;
305
306 for (type = 0; type < MAXQUOTAS; type++)
307 free_recovery_list(&(rec->r_list[type]));
308 kfree(rec);
309}
310
311/* Load entries in our quota file we have to recover*/
312static int ocfs2_recovery_load_quota(struct inode *lqinode,
313 struct ocfs2_local_disk_dqinfo *ldinfo,
314 int type,
315 struct list_head *head)
316{
317 struct super_block *sb = lqinode->i_sb;
318 struct buffer_head *hbh;
319 struct ocfs2_local_disk_chunk *dchunk;
320 int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
321 int status = 0;
322
323 for (i = 0; i < chunks; i++) {
324 hbh = NULL;
325 status = ocfs2_read_quota_block(lqinode,
326 ol_quota_chunk_block(sb, i),
327 &hbh);
328 if (status) {
329 mlog_errno(status);
330 break;
331 }
332 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
333 if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
334 status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
335 brelse(hbh);
336 if (status < 0)
337 break;
338 }
339 if (status < 0)
340 free_recovery_list(head);
341 return status;
342}
343
344static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
345{
346 int type;
347 struct ocfs2_quota_recovery *rec;
348
349 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
350 if (!rec)
351 return NULL;
352 for (type = 0; type < MAXQUOTAS; type++)
353 INIT_LIST_HEAD(&(rec->r_list[type]));
354 return rec;
355}
356
357/* Load information we need for quota recovery into memory */
358struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
359 struct ocfs2_super *osb,
360 int slot_num)
361{
362 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
363 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
364 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
365 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
366 struct super_block *sb = osb->sb;
367 struct ocfs2_local_disk_dqinfo *ldinfo;
368 struct inode *lqinode;
369 struct buffer_head *bh;
370 int type;
371 int status = 0;
372 struct ocfs2_quota_recovery *rec;
373
374 mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
375 rec = ocfs2_alloc_quota_recovery();
376 if (!rec)
377 return ERR_PTR(-ENOMEM);
378 /* First init... */
379
380 for (type = 0; type < MAXQUOTAS; type++) {
381 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
382 continue;
383 /* At this point, journal of the slot is already replayed so
384 * we can trust metadata and data of the quota file */
385 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
386 if (!lqinode) {
387 status = -ENOENT;
388 goto out;
389 }
390 status = ocfs2_inode_lock_full(lqinode, NULL, 1,
391 OCFS2_META_LOCK_RECOVERY);
392 if (status < 0) {
393 mlog_errno(status);
394 goto out_put;
395 }
396 /* Now read local header */
397 bh = NULL;
398 status = ocfs2_read_quota_block(lqinode, 0, &bh);
399 if (status) {
400 mlog_errno(status);
401 mlog(ML_ERROR, "failed to read quota file info header "
402 "(slot=%d type=%d)\n", slot_num, type);
403 goto out_lock;
404 }
405 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
406 OCFS2_LOCAL_INFO_OFF);
407 status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
408 &rec->r_list[type]);
409 brelse(bh);
410out_lock:
411 ocfs2_inode_unlock(lqinode, 1);
412out_put:
413 iput(lqinode);
414 if (status < 0)
415 break;
416 }
417out:
418 if (status < 0) {
419 ocfs2_free_quota_recovery(rec);
420 rec = ERR_PTR(status);
421 }
422 return rec;
423}
424
425/* Sync changes in local quota file into global quota file and
426 * reinitialize local quota file.
427 * The function expects local quota file to be already locked and
428 * dqonoff_mutex locked. */
429static int ocfs2_recover_local_quota_file(struct inode *lqinode,
430 int type,
431 struct ocfs2_quota_recovery *rec)
432{
433 struct super_block *sb = lqinode->i_sb;
434 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
435 struct ocfs2_local_disk_chunk *dchunk;
436 struct ocfs2_local_disk_dqblk *dqblk;
437 struct dquot *dquot;
438 handle_t *handle;
439 struct buffer_head *hbh = NULL, *qbh = NULL;
440 int status = 0;
441 int bit, chunk;
442 struct ocfs2_recovery_chunk *rchunk, *next;
443 qsize_t spacechange, inodechange;
444
445 mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
446
447 status = ocfs2_lock_global_qf(oinfo, 1);
448 if (status < 0)
449 goto out;
450
451 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
452 chunk = rchunk->rc_chunk;
453 hbh = NULL;
454 status = ocfs2_read_quota_block(lqinode,
455 ol_quota_chunk_block(sb, chunk),
456 &hbh);
457 if (status) {
458 mlog_errno(status);
459 break;
460 }
461 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
462 for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
463 qbh = NULL;
464 status = ocfs2_read_quota_block(lqinode,
465 ol_dqblk_block(sb, chunk, bit),
466 &qbh);
467 if (status) {
468 mlog_errno(status);
469 break;
470 }
471 dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
472 ol_dqblk_block_off(sb, chunk, bit));
473 dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
474 if (!dquot) {
475 status = -EIO;
476 mlog(ML_ERROR, "Failed to get quota structure "
477 "for id %u, type %d. Cannot finish quota "
478 "file recovery.\n",
479 (unsigned)le64_to_cpu(dqblk->dqb_id),
480 type);
481 goto out_put_bh;
482 }
483 handle = ocfs2_start_trans(OCFS2_SB(sb),
484 OCFS2_QSYNC_CREDITS);
485 if (IS_ERR(handle)) {
486 status = PTR_ERR(handle);
487 mlog_errno(status);
488 goto out_put_dquot;
489 }
490 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
491 spin_lock(&dq_data_lock);
492 /* Add usage from quota entry into quota changes
493 * of our node. Auxiliary variables are important
494 * due to signedness */
495 spacechange = le64_to_cpu(dqblk->dqb_spacemod);
496 inodechange = le64_to_cpu(dqblk->dqb_inodemod);
497 dquot->dq_dqb.dqb_curspace += spacechange;
498 dquot->dq_dqb.dqb_curinodes += inodechange;
499 spin_unlock(&dq_data_lock);
500 /* We want to drop reference held by the crashed
501 * node. Since we have our own reference we know
502 * global structure actually won't be freed. */
503 status = ocfs2_global_release_dquot(dquot);
504 if (status < 0) {
505 mlog_errno(status);
506 goto out_commit;
507 }
508 /* Release local quota file entry */
509 status = ocfs2_journal_access_dq(handle, lqinode,
510 qbh, OCFS2_JOURNAL_ACCESS_WRITE);
511 if (status < 0) {
512 mlog_errno(status);
513 goto out_commit;
514 }
515 lock_buffer(qbh);
516 WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
517 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
518 le32_add_cpu(&dchunk->dqc_free, 1);
519 unlock_buffer(qbh);
520 status = ocfs2_journal_dirty(handle, qbh);
521 if (status < 0)
522 mlog_errno(status);
523out_commit:
524 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
525 ocfs2_commit_trans(OCFS2_SB(sb), handle);
526out_put_dquot:
527 dqput(dquot);
528out_put_bh:
529 brelse(qbh);
530 if (status < 0)
531 break;
532 }
533 brelse(hbh);
534 list_del(&rchunk->rc_list);
535 kfree(rchunk->rc_bitmap);
536 kfree(rchunk);
537 if (status < 0)
538 break;
539 }
540 ocfs2_unlock_global_qf(oinfo, 1);
541out:
542 if (status < 0)
543 free_recovery_list(&(rec->r_list[type]));
544 mlog_exit(status);
545 return status;
546}
547
548/* Recover local quota files for given node different from us */
549int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
550 struct ocfs2_quota_recovery *rec,
551 int slot_num)
552{
553 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
554 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
555 struct super_block *sb = osb->sb;
556 struct ocfs2_local_disk_dqinfo *ldinfo;
557 struct buffer_head *bh;
558 handle_t *handle;
559 int type;
560 int status = 0;
561 struct inode *lqinode;
562 unsigned int flags;
563
564 mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
565 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
566 for (type = 0; type < MAXQUOTAS; type++) {
567 if (list_empty(&(rec->r_list[type])))
568 continue;
569 mlog(0, "Recovering quota in slot %d\n", slot_num);
570 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
571 if (!lqinode) {
572 status = -ENOENT;
573 goto out;
574 }
575 status = ocfs2_inode_lock_full(lqinode, NULL, 1,
576 OCFS2_META_LOCK_NOQUEUE);
577 /* Someone else is holding the lock? Then he must be
578 * doing the recovery. Just skip the file... */
579 if (status == -EAGAIN) {
580 mlog(ML_NOTICE, "skipping quota recovery for slot %d "
581 "because quota file is locked.\n", slot_num);
582 status = 0;
583 goto out_put;
584 } else if (status < 0) {
585 mlog_errno(status);
586 goto out_put;
587 }
588 /* Now read local header */
589 bh = NULL;
590 status = ocfs2_read_quota_block(lqinode, 0, &bh);
591 if (status) {
592 mlog_errno(status);
593 mlog(ML_ERROR, "failed to read quota file info header "
594 "(slot=%d type=%d)\n", slot_num, type);
595 goto out_lock;
596 }
597 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
598 OCFS2_LOCAL_INFO_OFF);
599 /* Is recovery still needed? */
600 flags = le32_to_cpu(ldinfo->dqi_flags);
601 if (!(flags & OLQF_CLEAN))
602 status = ocfs2_recover_local_quota_file(lqinode,
603 type,
604 rec);
605 /* We don't want to mark file as clean when it is actually
606 * active */
607 if (slot_num == osb->slot_num)
608 goto out_bh;
609 /* Mark quota file as clean if we are recovering quota file of
610 * some other node. */
611 handle = ocfs2_start_trans(osb, 1);
612 if (IS_ERR(handle)) {
613 status = PTR_ERR(handle);
614 mlog_errno(status);
615 goto out_bh;
616 }
617 status = ocfs2_journal_access_dq(handle, lqinode, bh,
618 OCFS2_JOURNAL_ACCESS_WRITE);
619 if (status < 0) {
620 mlog_errno(status);
621 goto out_trans;
622 }
623 lock_buffer(bh);
624 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
625 unlock_buffer(bh);
626 status = ocfs2_journal_dirty(handle, bh);
627 if (status < 0)
628 mlog_errno(status);
629out_trans:
630 ocfs2_commit_trans(osb, handle);
631out_bh:
632 brelse(bh);
633out_lock:
634 ocfs2_inode_unlock(lqinode, 1);
635out_put:
636 iput(lqinode);
637 if (status < 0)
638 break;
639 }
640out:
641 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
642 kfree(rec);
643 return status;
644}
645
646/* Read information header from quota file */
647static int ocfs2_local_read_info(struct super_block *sb, int type)
648{
649 struct ocfs2_local_disk_dqinfo *ldinfo;
650 struct mem_dqinfo *info = sb_dqinfo(sb, type);
651 struct ocfs2_mem_dqinfo *oinfo;
652 struct inode *lqinode = sb_dqopt(sb)->files[type];
653 int status;
654 struct buffer_head *bh = NULL;
655 struct ocfs2_quota_recovery *rec;
656 int locked = 0;
657
658 info->dqi_maxblimit = 0x7fffffffffffffffLL;
659 info->dqi_maxilimit = 0x7fffffffffffffffLL;
660 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
661 if (!oinfo) {
662 mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
663 " info.");
664 goto out_err;
665 }
666 info->dqi_priv = oinfo;
667 oinfo->dqi_type = type;
668 INIT_LIST_HEAD(&oinfo->dqi_chunk);
669 oinfo->dqi_rec = NULL;
670 oinfo->dqi_lqi_bh = NULL;
671 oinfo->dqi_ibh = NULL;
672
673 status = ocfs2_global_read_info(sb, type);
674 if (status < 0)
675 goto out_err;
676
677 status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
678 if (status < 0) {
679 mlog_errno(status);
680 goto out_err;
681 }
682 locked = 1;
683
684 /* Now read local header */
685 status = ocfs2_read_quota_block(lqinode, 0, &bh);
686 if (status) {
687 mlog_errno(status);
688 mlog(ML_ERROR, "failed to read quota file info header "
689 "(type=%d)\n", type);
690 goto out_err;
691 }
692 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
693 OCFS2_LOCAL_INFO_OFF);
694 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
695 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
696 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
697 oinfo->dqi_ibh = bh;
698
699 /* We crashed when using local quota file? */
700 if (!(info->dqi_flags & OLQF_CLEAN)) {
701 rec = OCFS2_SB(sb)->quota_rec;
702 if (!rec) {
703 rec = ocfs2_alloc_quota_recovery();
704 if (!rec) {
705 status = -ENOMEM;
706 mlog_errno(status);
707 goto out_err;
708 }
709 OCFS2_SB(sb)->quota_rec = rec;
710 }
711
712 status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
713 &rec->r_list[type]);
714 if (status < 0) {
715 mlog_errno(status);
716 goto out_err;
717 }
718 }
719
720 status = ocfs2_load_local_quota_bitmaps(lqinode,
721 ldinfo,
722 &oinfo->dqi_chunk);
723 if (status < 0) {
724 mlog_errno(status);
725 goto out_err;
726 }
727
728 /* Now mark quota file as used */
729 info->dqi_flags &= ~OLQF_CLEAN;
730 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
731 if (status < 0) {
732 mlog_errno(status);
733 goto out_err;
734 }
735
736 return 0;
737out_err:
738 if (oinfo) {
739 iput(oinfo->dqi_gqinode);
740 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
741 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
742 brelse(oinfo->dqi_lqi_bh);
743 if (locked)
744 ocfs2_inode_unlock(lqinode, 1);
745 ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
746 kfree(oinfo);
747 }
748 brelse(bh);
749 return -1;
750}
751
752/* Write local info to quota file */
753static int ocfs2_local_write_info(struct super_block *sb, int type)
754{
755 struct mem_dqinfo *info = sb_dqinfo(sb, type);
756 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
757 ->dqi_ibh;
758 int status;
759
760 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
761 info);
762 if (status < 0) {
763 mlog_errno(status);
764 return -1;
765 }
766
767 return 0;
768}
769
770/* Release info from memory */
771static int ocfs2_local_free_info(struct super_block *sb, int type)
772{
773 struct mem_dqinfo *info = sb_dqinfo(sb, type);
774 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
775 struct ocfs2_quota_chunk *chunk;
776 struct ocfs2_local_disk_chunk *dchunk;
777 int mark_clean = 1, len;
778 int status;
779
780 /* At this point we know there are no more dquots and thus
781 * even if there's some sync in the pdflush queue, it won't
782 * find any dquots and return without doing anything */
783 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
784 iput(oinfo->dqi_gqinode);
785 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
786 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
787 list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
788 dchunk = (struct ocfs2_local_disk_chunk *)
789 (chunk->qc_headerbh->b_data);
790 if (chunk->qc_num < oinfo->dqi_chunks - 1) {
791 len = ol_chunk_entries(sb);
792 } else {
793 len = (oinfo->dqi_blocks -
794 ol_quota_chunk_block(sb, chunk->qc_num) - 1)
795 * ol_quota_entries_per_block(sb);
796 }
797 /* Not all entries free? Bug! */
798 if (le32_to_cpu(dchunk->dqc_free) != len) {
799 mlog(ML_ERROR, "releasing quota file with used "
800 "entries (type=%d)\n", type);
801 mark_clean = 0;
802 }
803 }
804 ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
805
806 /* dqonoff_mutex protects us against racing with recovery thread... */
807 if (oinfo->dqi_rec) {
808 ocfs2_free_quota_recovery(oinfo->dqi_rec);
809 mark_clean = 0;
810 }
811
812 if (!mark_clean)
813 goto out;
814
815 /* Mark local file as clean */
816 info->dqi_flags |= OLQF_CLEAN;
817 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
818 oinfo->dqi_ibh,
819 olq_update_info,
820 info);
821 if (status < 0) {
822 mlog_errno(status);
823 goto out;
824 }
825
826out:
827 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
828 brelse(oinfo->dqi_ibh);
829 brelse(oinfo->dqi_lqi_bh);
830 kfree(oinfo);
831 return 0;
832}
833
834static void olq_set_dquot(struct buffer_head *bh, void *private)
835{
836 struct ocfs2_dquot *od = private;
837 struct ocfs2_local_disk_dqblk *dqblk;
838 struct super_block *sb = od->dq_dquot.dq_sb;
839
840 dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
841 + ol_dqblk_block_offset(sb, od->dq_local_off));
842
843 dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
844 spin_lock(&dq_data_lock);
845 dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
846 od->dq_origspace);
847 dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
848 od->dq_originodes);
849 spin_unlock(&dq_data_lock);
850 mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
851 od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
852 (long long)le64_to_cpu(dqblk->dqb_inodemod));
853}
854
855/* Write dquot to local quota file */
856static int ocfs2_local_write_dquot(struct dquot *dquot)
857{
858 struct super_block *sb = dquot->dq_sb;
859 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
860 struct buffer_head *bh = NULL;
861 int status;
862
863 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
864 ol_dqblk_file_block(sb, od->dq_local_off),
865 &bh);
866 if (status) {
867 mlog_errno(status);
868 goto out;
869 }
870 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
871 olq_set_dquot, od);
872 if (status < 0) {
873 mlog_errno(status);
874 goto out;
875 }
876out:
877 brelse(bh);
878 return status;
879}
880
881/* Find free entry in local quota file */
882static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
883 int type,
884 int *offset)
885{
886 struct mem_dqinfo *info = sb_dqinfo(sb, type);
887 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
888 struct ocfs2_quota_chunk *chunk;
889 struct ocfs2_local_disk_chunk *dchunk;
890 int found = 0, len;
891
892 list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
893 dchunk = (struct ocfs2_local_disk_chunk *)
894 chunk->qc_headerbh->b_data;
895 if (le32_to_cpu(dchunk->dqc_free) > 0) {
896 found = 1;
897 break;
898 }
899 }
900 if (!found)
901 return NULL;
902
903 if (chunk->qc_num < oinfo->dqi_chunks - 1) {
904 len = ol_chunk_entries(sb);
905 } else {
906 len = (oinfo->dqi_blocks -
907 ol_quota_chunk_block(sb, chunk->qc_num) - 1)
908 * ol_quota_entries_per_block(sb);
909 }
910
911 found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
912 /* We failed? */
913 if (found == len) {
914 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
915 " entries free (type=%d)\n", chunk->qc_num,
916 le32_to_cpu(dchunk->dqc_free), type);
917 return ERR_PTR(-EIO);
918 }
919 *offset = found;
920 return chunk;
921}
922
923/* Add new chunk to the local quota file */
924static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
925 struct super_block *sb,
926 int type,
927 int *offset)
928{
929 struct mem_dqinfo *info = sb_dqinfo(sb, type);
930 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
931 struct inode *lqinode = sb_dqopt(sb)->files[type];
932 struct ocfs2_quota_chunk *chunk = NULL;
933 struct ocfs2_local_disk_chunk *dchunk;
934 int status;
935 handle_t *handle;
936 struct buffer_head *bh = NULL;
937 u64 p_blkno;
938
939 /* We are protected by dqio_sem so no locking needed */
940 status = ocfs2_extend_no_holes(lqinode,
941 lqinode->i_size + 2 * sb->s_blocksize,
942 lqinode->i_size);
943 if (status < 0) {
944 mlog_errno(status);
945 goto out;
946 }
947 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
948 lqinode->i_size + 2 * sb->s_blocksize);
949 if (status < 0) {
950 mlog_errno(status);
951 goto out;
952 }
953
954 chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
955 if (!chunk) {
956 status = -ENOMEM;
957 mlog_errno(status);
958 goto out;
959 }
960
961 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
962 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
963 &p_blkno, NULL, NULL);
964 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
965 if (status < 0) {
966 mlog_errno(status);
967 goto out;
968 }
969 bh = sb_getblk(sb, p_blkno);
970 if (!bh) {
971 status = -ENOMEM;
972 mlog_errno(status);
973 goto out;
974 }
975 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
976
977 handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
978 if (IS_ERR(handle)) {
979 status = PTR_ERR(handle);
980 mlog_errno(status);
981 goto out;
982 }
983
984 status = ocfs2_journal_access_dq(handle, lqinode, bh,
985 OCFS2_JOURNAL_ACCESS_WRITE);
986 if (status < 0) {
987 mlog_errno(status);
988 goto out_trans;
989 }
990 lock_buffer(bh);
991 dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
992 memset(dchunk->dqc_bitmap, 0,
993 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
994 OCFS2_QBLK_RESERVED_SPACE);
995 set_buffer_uptodate(bh);
996 unlock_buffer(bh);
997 status = ocfs2_journal_dirty(handle, bh);
998 if (status < 0) {
999 mlog_errno(status);
1000 goto out_trans;
1001 }
1002
1003 oinfo->dqi_blocks += 2;
1004 oinfo->dqi_chunks++;
1005 status = ocfs2_local_write_info(sb, type);
1006 if (status < 0) {
1007 mlog_errno(status);
1008 goto out_trans;
1009 }
1010 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
1011 if (status < 0) {
1012 mlog_errno(status);
1013 goto out;
1014 }
1015
1016 list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
1017 chunk->qc_num = list_entry(chunk->qc_chunk.prev,
1018 struct ocfs2_quota_chunk,
1019 qc_chunk)->qc_num + 1;
1020 chunk->qc_headerbh = bh;
1021 *offset = 0;
1022 return chunk;
1023out_trans:
1024 ocfs2_commit_trans(OCFS2_SB(sb), handle);
1025out:
1026 brelse(bh);
1027 kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
1028 return ERR_PTR(status);
1029}
1030
1031/* Find free entry in local quota file */
1032static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1033 struct super_block *sb,
1034 int type,
1035 int *offset)
1036{
1037 struct mem_dqinfo *info = sb_dqinfo(sb, type);
1038 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
1039 struct ocfs2_quota_chunk *chunk;
1040 struct inode *lqinode = sb_dqopt(sb)->files[type];
1041 struct ocfs2_local_disk_chunk *dchunk;
1042 int epb = ol_quota_entries_per_block(sb);
1043 unsigned int chunk_blocks;
1044 int status;
1045 handle_t *handle;
1046
1047 if (list_empty(&oinfo->dqi_chunk))
1048 return ocfs2_local_quota_add_chunk(sb, type, offset);
1049 /* Is the last chunk full? */
1050 chunk = list_entry(oinfo->dqi_chunk.prev,
1051 struct ocfs2_quota_chunk, qc_chunk);
1052 chunk_blocks = oinfo->dqi_blocks -
1053 ol_quota_chunk_block(sb, chunk->qc_num) - 1;
1054 if (ol_chunk_blocks(sb) == chunk_blocks)
1055 return ocfs2_local_quota_add_chunk(sb, type, offset);
1056
1057 /* We are protected by dqio_sem so no locking needed */
1058 status = ocfs2_extend_no_holes(lqinode,
1059 lqinode->i_size + sb->s_blocksize,
1060 lqinode->i_size);
1061 if (status < 0) {
1062 mlog_errno(status);
1063 goto out;
1064 }
1065 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
1066 lqinode->i_size + sb->s_blocksize);
1067 if (status < 0) {
1068 mlog_errno(status);
1069 goto out;
1070 }
1071 handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
1072 if (IS_ERR(handle)) {
1073 status = PTR_ERR(handle);
1074 mlog_errno(status);
1075 goto out;
1076 }
1077 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
1078 OCFS2_JOURNAL_ACCESS_WRITE);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto out_trans;
1082 }
1083
1084 dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
1085 lock_buffer(chunk->qc_headerbh);
1086 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1087 unlock_buffer(chunk->qc_headerbh);
1088 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1089 if (status < 0) {
1090 mlog_errno(status);
1091 goto out_trans;
1092 }
1093 oinfo->dqi_blocks++;
1094 status = ocfs2_local_write_info(sb, type);
1095 if (status < 0) {
1096 mlog_errno(status);
1097 goto out_trans;
1098 }
1099
1100 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
1101 if (status < 0) {
1102 mlog_errno(status);
1103 goto out;
1104 }
1105 *offset = chunk_blocks * epb;
1106 return chunk;
1107out_trans:
1108 ocfs2_commit_trans(OCFS2_SB(sb), handle);
1109out:
1110 return ERR_PTR(status);
1111}
1112
1113static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1114{
1115 int *offset = private;
1116 struct ocfs2_local_disk_chunk *dchunk;
1117
1118 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
1119 ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
1120 le32_add_cpu(&dchunk->dqc_free, -1);
1121}
1122
1123/* Create dquot in the local file for given id */
1124static int ocfs2_create_local_dquot(struct dquot *dquot)
1125{
1126 struct super_block *sb = dquot->dq_sb;
1127 int type = dquot->dq_type;
1128 struct inode *lqinode = sb_dqopt(sb)->files[type];
1129 struct ocfs2_quota_chunk *chunk;
1130 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1131 int offset;
1132 int status;
1133
1134 chunk = ocfs2_find_free_entry(sb, type, &offset);
1135 if (!chunk) {
1136 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1137 if (IS_ERR(chunk))
1138 return PTR_ERR(chunk);
1139 } else if (IS_ERR(chunk)) {
1140 return PTR_ERR(chunk);
1141 }
1142 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1143 od->dq_chunk = chunk;
1144
1145 /* Initialize dquot structure on disk */
1146 status = ocfs2_local_write_dquot(dquot);
1147 if (status < 0) {
1148 mlog_errno(status);
1149 goto out;
1150 }
1151
1152 /* Mark structure as allocated */
1153 status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
1154 &offset);
1155 if (status < 0) {
1156 mlog_errno(status);
1157 goto out;
1158 }
1159out:
1160 return status;
1161}
1162
1163/* Create entry in local file for dquot, load data from the global file */
1164static int ocfs2_local_read_dquot(struct dquot *dquot)
1165{
1166 int status;
1167
1168 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1169
1170 status = ocfs2_global_read_dquot(dquot);
1171 if (status < 0) {
1172 mlog_errno(status);
1173 goto out_err;
1174 }
1175
1176 /* Now create entry in the local quota file */
1177 status = ocfs2_create_local_dquot(dquot);
1178 if (status < 0) {
1179 mlog_errno(status);
1180 goto out_err;
1181 }
1182 mlog_exit(0);
1183 return 0;
1184out_err:
1185 mlog_exit(status);
1186 return status;
1187}
1188
1189/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1190 * already started a transaction and obtained exclusive lock for global
1191 * quota file. */
1192static int ocfs2_local_release_dquot(struct dquot *dquot)
1193{
1194 int status;
1195 int type = dquot->dq_type;
1196 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1197 struct super_block *sb = dquot->dq_sb;
1198 struct ocfs2_local_disk_chunk *dchunk;
1199 int offset;
1200 handle_t *handle = journal_current_handle();
1201
1202 BUG_ON(!handle);
1203 /* First write all local changes to global file */
1204 status = ocfs2_global_release_dquot(dquot);
1205 if (status < 0) {
1206 mlog_errno(status);
1207 goto out;
1208 }
1209
1210 status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
1211 od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
1212 if (status < 0) {
1213 mlog_errno(status);
1214 goto out;
1215 }
1216 offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
1217 od->dq_local_off);
1218 dchunk = (struct ocfs2_local_disk_chunk *)
1219 (od->dq_chunk->qc_headerbh->b_data);
1220 /* Mark structure as freed */
1221 lock_buffer(od->dq_chunk->qc_headerbh);
1222 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1223 le32_add_cpu(&dchunk->dqc_free, 1);
1224 unlock_buffer(od->dq_chunk->qc_headerbh);
1225 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1226 if (status < 0) {
1227 mlog_errno(status);
1228 goto out;
1229 }
1230 status = 0;
1231out:
1232 /* Clear the read bit so that next time someone uses this
1233 * dquot he reads fresh info from disk and allocates local
1234 * dquot structure */
1235 clear_bit(DQ_READ_B, &dquot->dq_flags);
1236 return status;
1237}
1238
1239static struct quota_format_ops ocfs2_format_ops = {
1240 .check_quota_file = ocfs2_local_check_quota_file,
1241 .read_file_info = ocfs2_local_read_info,
1242 .write_file_info = ocfs2_global_write_info,
1243 .free_file_info = ocfs2_local_free_info,
1244 .read_dqblk = ocfs2_local_read_dquot,
1245 .commit_dqblk = ocfs2_local_write_dquot,
1246 .release_dqblk = ocfs2_local_release_dquot,
1247};
1248
1249struct quota_format_type ocfs2_quota_format = {
1250 .qf_fmt_id = QFMT_OCFS2,
1251 .qf_ops = &ocfs2_format_ops,
1252 .qf_owner = THIS_MODULE
1253};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a7..424adaa5f900 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", 106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
107 new_clusters, first_new_cluster); 107 new_clusters, first_new_cluster);
108 108
109 ret = ocfs2_journal_access(handle, bm_inode, group_bh, 109 ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
110 OCFS2_JOURNAL_ACCESS_WRITE); 110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (ret < 0) { 111 if (ret < 0) {
112 mlog_errno(ret); 112 mlog_errno(ret);
113 goto out; 113 goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
141 } 141 }
142 142
143 /* update the inode accordingly. */ 143 /* update the inode accordingly. */
144 ret = ocfs2_journal_access(handle, bm_inode, bm_bh, 144 ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
145 OCFS2_JOURNAL_ACCESS_WRITE); 145 OCFS2_JOURNAL_ACCESS_WRITE);
146 if (ret < 0) { 146 if (ret < 0) {
147 mlog_errno(ret); 147 mlog_errno(ret);
148 goto out_rollback; 148 goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
314 314
315 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 315 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
316 316
317 /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
318 * so any corruption is a code bug. */
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320
317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
318 ocfs2_group_bitmap_size(osb->sb) * 8) { 322 ocfs2_group_bitmap_size(osb->sb) * 8) {
319 mlog(ML_ERROR, "The disk is too old and small. " 323 mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
322 goto out_unlock; 326 goto out_unlock;
323 } 327 }
324 328
325 if (!OCFS2_IS_VALID_DINODE(fe)) {
326 OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
327 ret = -EIO;
328 goto out_unlock;
329 }
330
331 first_new_cluster = le32_to_cpu(fe->i_clusters); 329 first_new_cluster = le32_to_cpu(fe->i_clusters);
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, 330 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1); 331 first_new_cluster - 1);
334 332
335 ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh); 333 ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
334 &group_bh);
336 if (ret < 0) { 335 if (ret < 0) {
337 mlog_errno(ret); 336 mlog_errno(ret);
338 goto out_unlock; 337 goto out_unlock;
339 } 338 }
340
341 group = (struct ocfs2_group_desc *)group_bh->b_data; 339 group = (struct ocfs2_group_desc *)group_bh->b_data;
342 340
343 ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
344 if (ret) {
345 mlog_errno(ret);
346 goto out_unlock;
347 }
348
349 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); 341 cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
350 if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > 342 if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
351 le16_to_cpu(fe->id2.i_chain.cl_cpg)) { 343 le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
398 struct buffer_head *group_bh) 390 struct buffer_head *group_bh)
399{ 391{
400 int ret; 392 int ret;
401 struct ocfs2_group_desc *gd; 393 struct ocfs2_group_desc *gd =
394 (struct ocfs2_group_desc *)group_bh->b_data;
402 u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); 395 u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
403 unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
404 le16_to_cpu(di->id2.i_chain.cl_bpc);
405
406 396
407 gd = (struct ocfs2_group_desc *)group_bh->b_data; 397 ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
398 if (ret)
399 goto out;
408 400
409 ret = -EIO; 401 ret = -EINVAL;
410 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) 402 if (le16_to_cpu(gd->bg_chain) != input->chain)
411 mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
412 (unsigned long long)le64_to_cpu(gd->bg_blkno));
413 else if (di->i_blkno != gd->bg_parent_dinode)
414 mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
415 "pointer (%llu, expected %llu)\n",
416 (unsigned long long)le64_to_cpu(gd->bg_blkno),
417 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
418 (unsigned long long)le64_to_cpu(di->i_blkno));
419 else if (le16_to_cpu(gd->bg_bits) > max_bits)
420 mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
421 (unsigned long long)le64_to_cpu(gd->bg_blkno),
422 le16_to_cpu(gd->bg_bits));
423 else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
424 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
425 "claims that %u are free\n",
426 (unsigned long long)le64_to_cpu(gd->bg_blkno),
427 le16_to_cpu(gd->bg_bits),
428 le16_to_cpu(gd->bg_free_bits_count));
429 else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
430 mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
431 "max bitmap bits of %u\n",
432 (unsigned long long)le64_to_cpu(gd->bg_blkno),
433 le16_to_cpu(gd->bg_bits),
434 8 * le16_to_cpu(gd->bg_size));
435 else if (le16_to_cpu(gd->bg_chain) != input->chain)
436 mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " 403 mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
437 "while input has %u set.\n", 404 "while input has %u set.\n",
438 (unsigned long long)le64_to_cpu(gd->bg_blkno), 405 (unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
451 else 418 else
452 ret = 0; 419 ret = 0;
453 420
421out:
454 return ret; 422 return ret;
455} 423}
456 424
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
568 cl = &fe->id2.i_chain; 536 cl = &fe->id2.i_chain;
569 cr = &cl->cl_recs[input->chain]; 537 cr = &cl->cl_recs[input->chain];
570 538
571 ret = ocfs2_journal_access(handle, main_bm_inode, group_bh, 539 ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
572 OCFS2_JOURNAL_ACCESS_WRITE); 540 OCFS2_JOURNAL_ACCESS_WRITE);
573 if (ret < 0) { 541 if (ret < 0) {
574 mlog_errno(ret); 542 mlog_errno(ret);
575 goto out_commit; 543 goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
584 goto out_commit; 552 goto out_commit;
585 } 553 }
586 554
587 ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh, 555 ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
588 OCFS2_JOURNAL_ACCESS_WRITE); 556 OCFS2_JOURNAL_ACCESS_WRITE);
589 if (ret < 0) { 557 if (ret < 0) {
590 mlog_errno(ret); 558 mlog_errno(ret);
591 goto out_commit; 559 goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f8508..40661e7824e9 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
151 * this is not true, the read of -1 (UINT64_MAX) will fail. 151 * this is not true, the read of -1 (UINT64_MAX) will fail.
152 */ 152 */
153 ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, 153 ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
154 OCFS2_BH_IGNORE_CACHE); 154 OCFS2_BH_IGNORE_CACHE, NULL);
155 if (ret == 0) { 155 if (ret == 0) {
156 spin_lock(&osb->osb_lock); 156 spin_lock(&osb->osb_lock);
157 ocfs2_update_slot_info(si); 157 ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
405 405
406 bh = NULL; /* Acquire a fresh bh */ 406 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, 407 status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
408 OCFS2_BH_IGNORE_CACHE); 408 OCFS2_BH_IGNORE_CACHE, NULL);
409 if (status < 0) { 409 if (status < 0) {
410 mlog_errno(status); 410 mlog_errno(status);
411 goto bail; 411 goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b57..a69628603e18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
35#include "ocfs2.h" 35#include "ocfs2.h"
36 36
37#include "alloc.h" 37#include "alloc.h"
38#include "blockcheck.h"
38#include "dlmglue.h" 39#include "dlmglue.h"
39#include "inode.h" 40#include "inode.h"
40#include "journal.h" 41#include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
145 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); 146 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
146} 147}
147 148
148/* somewhat more expensive than our other checks, so use sparingly. */ 149#define do_error(fmt, ...) \
149int ocfs2_check_group_descriptor(struct super_block *sb, 150 do{ \
150 struct ocfs2_dinode *di, 151 if (clean_error) \
151 struct ocfs2_group_desc *gd) 152 mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
153 else \
154 ocfs2_error(sb, fmt, ##__VA_ARGS__); \
155 } while (0)
156
157static int ocfs2_validate_gd_self(struct super_block *sb,
158 struct buffer_head *bh,
159 int clean_error)
152{ 160{
153 unsigned int max_bits; 161 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
154 162
155 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { 163 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
156 OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd); 164 do_error("Group descriptor #%llu has bad signature %.*s",
157 return -EIO; 165 (unsigned long long)bh->b_blocknr, 7,
166 gd->bg_signature);
167 return -EINVAL;
158 } 168 }
159 169
170 if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
171 do_error("Group descriptor #%llu has an invalid bg_blkno "
172 "of %llu",
173 (unsigned long long)bh->b_blocknr,
174 (unsigned long long)le64_to_cpu(gd->bg_blkno));
175 return -EINVAL;
176 }
177
178 if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
179 do_error("Group descriptor #%llu has an invalid "
180 "fs_generation of #%u",
181 (unsigned long long)bh->b_blocknr,
182 le32_to_cpu(gd->bg_generation));
183 return -EINVAL;
184 }
185
186 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
187 do_error("Group descriptor #%llu has bit count %u but "
188 "claims that %u are free",
189 (unsigned long long)bh->b_blocknr,
190 le16_to_cpu(gd->bg_bits),
191 le16_to_cpu(gd->bg_free_bits_count));
192 return -EINVAL;
193 }
194
195 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
196 do_error("Group descriptor #%llu has bit count %u but "
197 "max bitmap bits of %u",
198 (unsigned long long)bh->b_blocknr,
199 le16_to_cpu(gd->bg_bits),
200 8 * le16_to_cpu(gd->bg_size));
201 return -EINVAL;
202 }
203
204 return 0;
205}
206
207static int ocfs2_validate_gd_parent(struct super_block *sb,
208 struct ocfs2_dinode *di,
209 struct buffer_head *bh,
210 int clean_error)
211{
212 unsigned int max_bits;
213 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
214
160 if (di->i_blkno != gd->bg_parent_dinode) { 215 if (di->i_blkno != gd->bg_parent_dinode) {
161 ocfs2_error(sb, "Group descriptor # %llu has bad parent " 216 do_error("Group descriptor #%llu has bad parent "
162 "pointer (%llu, expected %llu)", 217 "pointer (%llu, expected %llu)",
163 (unsigned long long)le64_to_cpu(gd->bg_blkno), 218 (unsigned long long)bh->b_blocknr,
164 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), 219 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
165 (unsigned long long)le64_to_cpu(di->i_blkno)); 220 (unsigned long long)le64_to_cpu(di->i_blkno));
166 return -EIO; 221 return -EINVAL;
167 } 222 }
168 223
169 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); 224 max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
170 if (le16_to_cpu(gd->bg_bits) > max_bits) { 225 if (le16_to_cpu(gd->bg_bits) > max_bits) {
171 ocfs2_error(sb, "Group descriptor # %llu has bit count of %u", 226 do_error("Group descriptor #%llu has bit count of %u",
172 (unsigned long long)le64_to_cpu(gd->bg_blkno), 227 (unsigned long long)bh->b_blocknr,
173 le16_to_cpu(gd->bg_bits)); 228 le16_to_cpu(gd->bg_bits));
174 return -EIO; 229 return -EINVAL;
175 } 230 }
176 231
177 if (le16_to_cpu(gd->bg_chain) >= 232 if (le16_to_cpu(gd->bg_chain) >=
178 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { 233 le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
179 ocfs2_error(sb, "Group descriptor # %llu has bad chain %u", 234 do_error("Group descriptor #%llu has bad chain %u",
180 (unsigned long long)le64_to_cpu(gd->bg_blkno), 235 (unsigned long long)bh->b_blocknr,
181 le16_to_cpu(gd->bg_chain)); 236 le16_to_cpu(gd->bg_chain));
182 return -EIO; 237 return -EINVAL;
183 } 238 }
184 239
185 if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { 240 return 0;
186 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " 241}
187 "claims that %u are free",
188 (unsigned long long)le64_to_cpu(gd->bg_blkno),
189 le16_to_cpu(gd->bg_bits),
190 le16_to_cpu(gd->bg_free_bits_count));
191 return -EIO;
192 }
193 242
194 if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { 243#undef do_error
195 ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " 244
196 "max bitmap bits of %u", 245/*
197 (unsigned long long)le64_to_cpu(gd->bg_blkno), 246 * This version only prints errors. It does not fail the filesystem, and
198 le16_to_cpu(gd->bg_bits), 247 * exists only for resize.
199 8 * le16_to_cpu(gd->bg_size)); 248 */
200 return -EIO; 249int ocfs2_check_group_descriptor(struct super_block *sb,
250 struct ocfs2_dinode *di,
251 struct buffer_head *bh)
252{
253 int rc;
254 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
255
256 BUG_ON(!buffer_uptodate(bh));
257
258 /*
259 * If the ecc fails, we return the error but otherwise
260 * leave the filesystem running. We know any error is
261 * local to this block.
262 */
263 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
264 if (rc) {
265 mlog(ML_ERROR,
266 "Checksum failed for group descriptor %llu\n",
267 (unsigned long long)bh->b_blocknr);
268 } else
269 rc = ocfs2_validate_gd_self(sb, bh, 1);
270 if (!rc)
271 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
272
273 return rc;
274}
275
276static int ocfs2_validate_group_descriptor(struct super_block *sb,
277 struct buffer_head *bh)
278{
279 int rc;
280 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
281
282 mlog(0, "Validating group descriptor %llu\n",
283 (unsigned long long)bh->b_blocknr);
284
285 BUG_ON(!buffer_uptodate(bh));
286
287 /*
288 * If the ecc fails, we return the error but otherwise
289 * leave the filesystem running. We know any error is
290 * local to this block.
291 */
292 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
293 if (rc)
294 return rc;
295
296 /*
297 * Errors after here are fatal.
298 */
299
300 return ocfs2_validate_gd_self(sb, bh, 0);
301}
302
303int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
304 u64 gd_blkno, struct buffer_head **bh)
305{
306 int rc;
307 struct buffer_head *tmp = *bh;
308
309 rc = ocfs2_read_block(inode, gd_blkno, &tmp,
310 ocfs2_validate_group_descriptor);
311 if (rc)
312 goto out;
313
314 rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
315 if (rc) {
316 brelse(tmp);
317 goto out;
201 } 318 }
202 319
203 return 0; 320 /* If ocfs2_read_block() got us a new bh, pass it up. */
321 if (!*bh)
322 *bh = tmp;
323
324out:
325 return rc;
204} 326}
205 327
206static int ocfs2_block_group_fill(handle_t *handle, 328static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
225 goto bail; 347 goto bail;
226 } 348 }
227 349
228 status = ocfs2_journal_access(handle, 350 status = ocfs2_journal_access_gd(handle,
229 alloc_inode, 351 alloc_inode,
230 bg_bh, 352 bg_bh,
231 OCFS2_JOURNAL_ACCESS_CREATE); 353 OCFS2_JOURNAL_ACCESS_CREATE);
232 if (status < 0) { 354 if (status < 0) {
233 mlog_errno(status); 355 mlog_errno(status);
234 goto bail; 356 goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
358 480
359 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 481 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
360 482
361 status = ocfs2_journal_access(handle, alloc_inode, 483 status = ocfs2_journal_access_di(handle, alloc_inode,
362 bh, OCFS2_JOURNAL_ACCESS_WRITE); 484 bh, OCFS2_JOURNAL_ACCESS_WRITE);
363 if (status < 0) { 485 if (status < 0) {
364 mlog_errno(status); 486 mlog_errno(status);
365 goto bail; 487 goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
441 ac->ac_alloc_slot = slot; 563 ac->ac_alloc_slot = slot;
442 564
443 fe = (struct ocfs2_dinode *) bh->b_data; 565 fe = (struct ocfs2_dinode *) bh->b_data;
444 if (!OCFS2_IS_VALID_DINODE(fe)) { 566
445 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); 567 /* The bh was validated by the inode read inside
446 status = -EIO; 568 * ocfs2_inode_lock(). Any corruption is a code bug. */
447 goto bail; 569 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
448 } 570
449 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { 571 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
450 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", 572 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
451 (unsigned long long)le64_to_cpu(fe->i_blkno)); 573 (unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
790 int offset, start, found, status = 0; 912 int offset, start, found, status = 0;
791 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 913 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
792 914
793 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 915 /* Callers got this descriptor from
794 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); 916 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
795 return -EIO; 917 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
796 }
797 918
798 found = start = best_offset = best_size = 0; 919 found = start = best_offset = best_size = 0;
799 bitmap = bg->bg_bitmap; 920 bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
858 979
859 mlog_entry_void(); 980 mlog_entry_void();
860 981
861 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 982 /* All callers get the descriptor via
862 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); 983 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
863 status = -EIO; 984 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
864 goto bail;
865 }
866 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 985 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
867 986
868 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 987 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
871 if (ocfs2_is_cluster_bitmap(alloc_inode)) 990 if (ocfs2_is_cluster_bitmap(alloc_inode))
872 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 991 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
873 992
874 status = ocfs2_journal_access(handle, 993 status = ocfs2_journal_access_gd(handle,
875 alloc_inode, 994 alloc_inode,
876 group_bh, 995 group_bh,
877 journal_type); 996 journal_type);
878 if (status < 0) { 997 if (status < 0) {
879 mlog_errno(status); 998 mlog_errno(status);
880 goto bail; 999 goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
931 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 1050 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
932 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; 1051 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
933 1052
934 if (!OCFS2_IS_VALID_DINODE(fe)) { 1053 /* The caller got these descriptors from
935 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); 1054 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
936 status = -EIO; 1055 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
937 goto out; 1056 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
938 }
939 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
940 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
941 status = -EIO;
942 goto out;
943 }
944 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
945 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
946 status = -EIO;
947 goto out;
948 }
949 1057
950 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", 1058 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
951 (unsigned long long)le64_to_cpu(fe->i_blkno), chain, 1059 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
956 bg_ptr = le64_to_cpu(bg->bg_next_group); 1064 bg_ptr = le64_to_cpu(bg->bg_next_group);
957 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); 1065 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
958 1066
959 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, 1067 status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
960 OCFS2_JOURNAL_ACCESS_WRITE); 1068 OCFS2_JOURNAL_ACCESS_WRITE);
961 if (status < 0) { 1069 if (status < 0) {
962 mlog_errno(status); 1070 mlog_errno(status);
963 goto out_rollback; 1071 goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
971 goto out_rollback; 1079 goto out_rollback;
972 } 1080 }
973 1081
974 status = ocfs2_journal_access(handle, alloc_inode, bg_bh, 1082 status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
975 OCFS2_JOURNAL_ACCESS_WRITE); 1083 OCFS2_JOURNAL_ACCESS_WRITE);
976 if (status < 0) { 1084 if (status < 0) {
977 mlog_errno(status); 1085 mlog_errno(status);
978 goto out_rollback; 1086 goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
986 goto out_rollback; 1094 goto out_rollback;
987 } 1095 }
988 1096
989 status = ocfs2_journal_access(handle, alloc_inode, fe_bh, 1097 status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
990 OCFS2_JOURNAL_ACCESS_WRITE); 1098 OCFS2_JOURNAL_ACCESS_WRITE);
991 if (status < 0) { 1099 if (status < 0) {
992 mlog_errno(status); 1100 mlog_errno(status);
993 goto out_rollback; 1101 goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
1008 bg->bg_next_group = cpu_to_le64(bg_ptr); 1116 bg->bg_next_group = cpu_to_le64(bg_ptr);
1009 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); 1117 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1010 } 1118 }
1011out: 1119
1012 mlog_exit(status); 1120 mlog_exit(status);
1013 return status; 1121 return status;
1014} 1122}
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1138 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 1246 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1139 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; 1247 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1140 1248
1141 ret = ocfs2_journal_access(handle, inode, di_bh, 1249 ret = ocfs2_journal_access_di(handle, inode, di_bh,
1142 OCFS2_JOURNAL_ACCESS_WRITE); 1250 OCFS2_JOURNAL_ACCESS_WRITE);
1143 if (ret < 0) { 1251 if (ret < 0) {
1144 mlog_errno(ret); 1252 mlog_errno(ret);
1145 goto out; 1253 goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1170 u16 found; 1278 u16 found;
1171 struct buffer_head *group_bh = NULL; 1279 struct buffer_head *group_bh = NULL;
1172 struct ocfs2_group_desc *gd; 1280 struct ocfs2_group_desc *gd;
1281 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1173 struct inode *alloc_inode = ac->ac_inode; 1282 struct inode *alloc_inode = ac->ac_inode;
1174 1283
1175 ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh); 1284 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1285 &group_bh);
1176 if (ret < 0) { 1286 if (ret < 0) {
1177 mlog_errno(ret); 1287 mlog_errno(ret);
1178 return ret; 1288 return ret;
1179 } 1289 }
1180 1290
1181 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1291 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1182 if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
1183 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
1184 ret = -EIO;
1185 goto out;
1186 }
1187
1188 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1292 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1189 ac->ac_max_block, bit_off, &found); 1293 ac->ac_max_block, bit_off, &found);
1190 if (ret < 0) { 1294 if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1241 bits_wanted, chain, 1345 bits_wanted, chain,
1242 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1346 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1243 1347
1244 status = ocfs2_read_block(alloc_inode, 1348 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1245 le64_to_cpu(cl->cl_recs[chain].c_blkno), 1349 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1246 &group_bh); 1350 &group_bh);
1247 if (status < 0) { 1351 if (status < 0) {
1248 mlog_errno(status); 1352 mlog_errno(status);
1249 goto bail; 1353 goto bail;
1250 } 1354 }
1251 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1355 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1252 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1253 if (status) {
1254 mlog_errno(status);
1255 goto bail;
1256 }
1257 1356
1258 status = -ENOSPC; 1357 status = -ENOSPC;
1259 /* for now, the chain search is a bit simplistic. We just use 1358 /* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1271 next_group = le64_to_cpu(bg->bg_next_group); 1370 next_group = le64_to_cpu(bg->bg_next_group);
1272 prev_group_bh = group_bh; 1371 prev_group_bh = group_bh;
1273 group_bh = NULL; 1372 group_bh = NULL;
1274 status = ocfs2_read_block(alloc_inode, 1373 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1275 next_group, &group_bh); 1374 next_group, &group_bh);
1276 if (status < 0) { 1375 if (status < 0) {
1277 mlog_errno(status); 1376 mlog_errno(status);
1278 goto bail; 1377 goto bail;
1279 } 1378 }
1280 bg = (struct ocfs2_group_desc *) group_bh->b_data; 1379 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1281 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1282 if (status) {
1283 mlog_errno(status);
1284 goto bail;
1285 }
1286 } 1380 }
1287 if (status < 0) { 1381 if (status < 0) {
1288 if (status != -ENOSPC) 1382 if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1324 1418
1325 /* Ok, claim our bits now: set the info on dinode, chainlist 1419 /* Ok, claim our bits now: set the info on dinode, chainlist
1326 * and then the group */ 1420 * and then the group */
1327 status = ocfs2_journal_access(handle, 1421 status = ocfs2_journal_access_di(handle,
1328 alloc_inode, 1422 alloc_inode,
1329 ac->ac_bh, 1423 ac->ac_bh,
1330 OCFS2_JOURNAL_ACCESS_WRITE); 1424 OCFS2_JOURNAL_ACCESS_WRITE);
1331 if (status < 0) { 1425 if (status < 0) {
1332 mlog_errno(status); 1426 mlog_errno(status);
1333 goto bail; 1427 goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1392 BUG_ON(!ac->ac_bh); 1486 BUG_ON(!ac->ac_bh);
1393 1487
1394 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; 1488 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1395 if (!OCFS2_IS_VALID_DINODE(fe)) { 1489
1396 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); 1490 /* The bh was validated by the inode read during
1397 status = -EIO; 1491 * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */
1398 goto bail; 1492 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1399 } 1493
1400 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1494 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1401 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1495 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1402 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1496 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1725 1819
1726 mlog_entry_void(); 1820 mlog_entry_void();
1727 1821
1728 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { 1822 /* The caller got this descriptor from
1729 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); 1823 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1730 status = -EIO; 1824 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1731 goto bail;
1732 }
1733 1825
1734 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 1826 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1735 1827
1736 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1828 if (ocfs2_is_cluster_bitmap(alloc_inode))
1737 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1829 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1738 1830
1739 status = ocfs2_journal_access(handle, alloc_inode, group_bh, 1831 status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
1740 journal_type); 1832 journal_type);
1741 if (status < 0) { 1833 if (status < 0) {
1742 mlog_errno(status); 1834 mlog_errno(status);
1743 goto bail; 1835 goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1782 1874
1783 mlog_entry_void(); 1875 mlog_entry_void();
1784 1876
1785 if (!OCFS2_IS_VALID_DINODE(fe)) { 1877 /* The alloc_bh comes from ocfs2_free_dinode() or
1786 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); 1878 * ocfs2_free_clusters(). The callers have all locked the
1787 status = -EIO; 1879 * allocator and gotten alloc_bh from the lock call. This
1788 goto bail; 1880 * validates the dinode buffer. Any corruption that has happended
1789 } 1881 * is a code bug. */
1882 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1790 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); 1883 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1791 1884
1792 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", 1885 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1793 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, 1886 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1794 (unsigned long long)bg_blkno, start_bit); 1887 (unsigned long long)bg_blkno, start_bit);
1795 1888
1796 status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh); 1889 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
1890 &group_bh);
1797 if (status < 0) { 1891 if (status < 0) {
1798 mlog_errno(status); 1892 mlog_errno(status);
1799 goto bail; 1893 goto bail;
1800 } 1894 }
1801
1802 group = (struct ocfs2_group_desc *) group_bh->b_data; 1895 group = (struct ocfs2_group_desc *) group_bh->b_data;
1803 status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group); 1896
1804 if (status) {
1805 mlog_errno(status);
1806 goto bail;
1807 }
1808 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); 1897 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1809 1898
1810 status = ocfs2_block_group_clear_bits(handle, alloc_inode, 1899 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1815 goto bail; 1904 goto bail;
1816 } 1905 }
1817 1906
1818 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, 1907 status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
1819 OCFS2_JOURNAL_ACCESS_WRITE); 1908 OCFS2_JOURNAL_ACCESS_WRITE);
1820 if (status < 0) { 1909 if (status < 0) {
1821 mlog_errno(status); 1910 mlog_errno(status);
1822 goto bail; 1911 goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f450..e3c13c77f9e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
164 * and return that block offset. */ 164 * and return that block offset. */
165u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); 165u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
166 166
167/* somewhat more expensive than our other checks, so use sparingly. */ 167/*
168 * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
169 * finds a problem. A caller that wants to check a group descriptor
170 * without going readonly should read the block with ocfs2_read_block[s]()
171 * and then checking it with this function. This is only resize, really.
172 * Everyone else should be using ocfs2_read_group_descriptor().
173 */
168int ocfs2_check_group_descriptor(struct super_block *sb, 174int ocfs2_check_group_descriptor(struct super_block *sb,
169 struct ocfs2_dinode *di, 175 struct ocfs2_dinode *di,
170 struct ocfs2_group_desc *gd); 176 struct buffer_head *bh);
177/*
178 * Read a group descriptor block into *bh. If *bh is NULL, a bh will be
179 * allocated. This is a cached read. The descriptor will be validated with
180 * ocfs2_validate_group_descriptor().
181 */
182int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
183 u64 gd_blkno, struct buffer_head **bh);
184
171int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, 185int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
172 u32 clusters_to_add, u32 extents_to_split, 186 u32 clusters_to_add, u32 extents_to_split,
173 struct ocfs2_alloc_context **data_ac, 187 struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78cf..b1cb38fbe807 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/quotaops.h>
44 45
45#define MLOG_MASK_PREFIX ML_SUPER 46#define MLOG_MASK_PREFIX ML_SUPER
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
51#include "ocfs1_fs_compat.h" 52#include "ocfs1_fs_compat.h"
52 53
53#include "alloc.h" 54#include "alloc.h"
55#include "blockcheck.h"
54#include "dlmglue.h" 56#include "dlmglue.h"
55#include "export.h" 57#include "export.h"
56#include "extent_map.h" 58#include "extent_map.h"
@@ -65,10 +67,13 @@
65#include "uptodate.h" 67#include "uptodate.h"
66#include "ver.h" 68#include "ver.h"
67#include "xattr.h" 69#include "xattr.h"
70#include "quota.h"
68 71
69#include "buffer_head_io.h" 72#include "buffer_head_io.h"
70 73
71static struct kmem_cache *ocfs2_inode_cachep = NULL; 74static struct kmem_cache *ocfs2_inode_cachep = NULL;
75struct kmem_cache *ocfs2_dquot_cachep;
76struct kmem_cache *ocfs2_qf_chunk_cachep;
72 77
73/* OCFS2 needs to schedule several differnt types of work which 78/* OCFS2 needs to schedule several differnt types of work which
74 * require cluster locking, disk I/O, recovery waits, etc. Since these 79 * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
124static void ocfs2_write_super(struct super_block *sb); 129static void ocfs2_write_super(struct super_block *sb);
125static struct inode *ocfs2_alloc_inode(struct super_block *sb); 130static struct inode *ocfs2_alloc_inode(struct super_block *sb);
126static void ocfs2_destroy_inode(struct inode *inode); 131static void ocfs2_destroy_inode(struct inode *inode);
132static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
133static int ocfs2_enable_quotas(struct ocfs2_super *osb);
134static void ocfs2_disable_quotas(struct ocfs2_super *osb);
127 135
128static const struct super_operations ocfs2_sops = { 136static const struct super_operations ocfs2_sops = {
129 .statfs = ocfs2_statfs, 137 .statfs = ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
137 .put_super = ocfs2_put_super, 145 .put_super = ocfs2_put_super,
138 .remount_fs = ocfs2_remount, 146 .remount_fs = ocfs2_remount,
139 .show_options = ocfs2_show_options, 147 .show_options = ocfs2_show_options,
148 .quota_read = ocfs2_quota_read,
149 .quota_write = ocfs2_quota_write,
140}; 150};
141 151
142enum { 152enum {
@@ -158,6 +168,10 @@ enum {
158 Opt_user_xattr, 168 Opt_user_xattr,
159 Opt_nouser_xattr, 169 Opt_nouser_xattr,
160 Opt_inode64, 170 Opt_inode64,
171 Opt_acl,
172 Opt_noacl,
173 Opt_usrquota,
174 Opt_grpquota,
161 Opt_err, 175 Opt_err,
162}; 176};
163 177
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
180 {Opt_user_xattr, "user_xattr"}, 194 {Opt_user_xattr, "user_xattr"},
181 {Opt_nouser_xattr, "nouser_xattr"}, 195 {Opt_nouser_xattr, "nouser_xattr"},
182 {Opt_inode64, "inode64"}, 196 {Opt_inode64, "inode64"},
197 {Opt_acl, "acl"},
198 {Opt_noacl, "noacl"},
199 {Opt_usrquota, "usrquota"},
200 {Opt_grpquota, "grpquota"},
183 {Opt_err, NULL} 201 {Opt_err, NULL}
184}; 202};
185 203
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
221 return 0; 239 return 0;
222} 240}
223 241
242static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
243{
244 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
245 && (ino == USER_QUOTA_SYSTEM_INODE
246 || ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
247 return 0;
248 if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
249 && (ino == GROUP_QUOTA_SYSTEM_INODE
250 || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
251 return 0;
252 return 1;
253}
254
224static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) 255static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
225{ 256{
226 struct inode *new = NULL; 257 struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
247 278
248 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; 279 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
249 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { 280 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
281 if (!ocfs2_need_system_inode(osb, i))
282 continue;
250 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 283 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
251 if (!new) { 284 if (!new) {
252 ocfs2_release_system_inodes(osb); 285 ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
277 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; 310 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
278 i < NUM_SYSTEM_INODES; 311 i < NUM_SYSTEM_INODES;
279 i++) { 312 i++) {
313 if (!ocfs2_need_system_inode(osb, i))
314 continue;
280 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); 315 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
281 if (!new) { 316 if (!new) {
282 ocfs2_release_system_inodes(osb); 317 ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
426 461
427 /* We're going to/from readonly mode. */ 462 /* We're going to/from readonly mode. */
428 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 463 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
464 /* Disable quota accounting before remounting RO */
465 if (*flags & MS_RDONLY) {
466 ret = ocfs2_susp_quotas(osb, 0);
467 if (ret < 0)
468 goto out;
469 }
429 /* Lock here so the check of HARD_RO and the potential 470 /* Lock here so the check of HARD_RO and the potential
430 * setting of SOFT_RO is atomic. */ 471 * setting of SOFT_RO is atomic. */
431 spin_lock(&osb->osb_lock); 472 spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
461 } 502 }
462unlock_osb: 503unlock_osb:
463 spin_unlock(&osb->osb_lock); 504 spin_unlock(&osb->osb_lock);
505 /* Enable quota accounting after remounting RW */
506 if (!ret && !(*flags & MS_RDONLY)) {
507 if (sb_any_quota_suspended(sb))
508 ret = ocfs2_susp_quotas(osb, 1);
509 else
510 ret = ocfs2_enable_quotas(osb);
511 if (ret < 0) {
512 /* Return back changes... */
513 spin_lock(&osb->osb_lock);
514 sb->s_flags |= MS_RDONLY;
515 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
516 spin_unlock(&osb->osb_lock);
517 goto out;
518 }
519 }
464 } 520 }
465 521
466 if (!ret) { 522 if (!ret) {
467 /* Only save off the new mount options in case of a successful 523 /* Only save off the new mount options in case of a successful
468 * remount. */ 524 * remount. */
525 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
526 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
469 osb->s_mount_opt = parsed_options.mount_opt; 527 osb->s_mount_opt = parsed_options.mount_opt;
470 osb->s_atime_quantum = parsed_options.atime_quantum; 528 osb->s_atime_quantum = parsed_options.atime_quantum;
471 osb->preferred_slot = parsed_options.slot; 529 osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
619 return 0; 677 return 0;
620} 678}
621 679
680static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
681{
682 int type;
683 struct super_block *sb = osb->sb;
684 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
686 int status = 0;
687
688 for (type = 0; type < MAXQUOTAS; type++) {
689 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
690 continue;
691 if (unsuspend)
692 status = vfs_quota_enable(
693 sb_dqopt(sb)->files[type],
694 type, QFMT_OCFS2,
695 DQUOT_SUSPENDED);
696 else
697 status = vfs_quota_disable(sb, type,
698 DQUOT_SUSPENDED);
699 if (status < 0)
700 break;
701 }
702 if (status < 0)
703 mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
704 "remount (error = %d).\n", status);
705 return status;
706}
707
708static int ocfs2_enable_quotas(struct ocfs2_super *osb)
709{
710 struct inode *inode[MAXQUOTAS] = { NULL, NULL };
711 struct super_block *sb = osb->sb;
712 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
713 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
714 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
715 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
716 int status;
717 int type;
718
719 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
720 for (type = 0; type < MAXQUOTAS; type++) {
721 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
722 continue;
723 inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
724 osb->slot_num);
725 if (!inode[type]) {
726 status = -ENOENT;
727 goto out_quota_off;
728 }
729 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
730 DQUOT_USAGE_ENABLED);
731 if (status < 0)
732 goto out_quota_off;
733 }
734
735 for (type = 0; type < MAXQUOTAS; type++)
736 iput(inode[type]);
737 return 0;
738out_quota_off:
739 ocfs2_disable_quotas(osb);
740 for (type = 0; type < MAXQUOTAS; type++)
741 iput(inode[type]);
742 mlog_errno(status);
743 return status;
744}
745
746static void ocfs2_disable_quotas(struct ocfs2_super *osb)
747{
748 int type;
749 struct inode *inode;
750 struct super_block *sb = osb->sb;
751
752 /* We mostly ignore errors in this function because there's not much
753 * we can do when we see them */
754 for (type = 0; type < MAXQUOTAS; type++) {
755 if (!sb_has_quota_loaded(sb, type))
756 continue;
757 inode = igrab(sb->s_dquot.files[type]);
758 /* Turn off quotas. This will remove all dquot structures from
759 * memory and so they will be automatically synced to global
760 * quota files */
761 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
762 DQUOT_LIMITS_ENABLED);
763 if (!inode)
764 continue;
765 iput(inode);
766 }
767}
768
769/* Handle quota on quotactl */
770static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
771 char *path, int remount)
772{
773 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
774 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
775
776 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
777 return -EINVAL;
778
779 if (remount)
780 return 0; /* Just ignore it has been handled in
781 * ocfs2_remount() */
782 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
783 format_id, DQUOT_LIMITS_ENABLED);
784}
785
786/* Handle quota off quotactl */
787static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
788{
789 if (remount)
790 return 0; /* Ignore now and handle later in
791 * ocfs2_remount() */
792 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
793}
794
795static struct quotactl_ops ocfs2_quotactl_ops = {
796 .quota_on = ocfs2_quota_on,
797 .quota_off = ocfs2_quota_off,
798 .quota_sync = vfs_quota_sync,
799 .get_info = vfs_get_dqinfo,
800 .set_info = vfs_set_dqinfo,
801 .get_dqblk = vfs_get_dqblk,
802 .set_dqblk = vfs_set_dqblk,
803};
804
622static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 805static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
623{ 806{
624 struct dentry *root; 807 struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
651 } 834 }
652 brelse(bh); 835 brelse(bh);
653 bh = NULL; 836 bh = NULL;
837
838 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
839 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
840
654 osb->s_mount_opt = parsed_options.mount_opt; 841 osb->s_mount_opt = parsed_options.mount_opt;
655 osb->s_atime_quantum = parsed_options.atime_quantum; 842 osb->s_atime_quantum = parsed_options.atime_quantum;
656 osb->preferred_slot = parsed_options.slot; 843 osb->preferred_slot = parsed_options.slot;
657 osb->osb_commit_interval = parsed_options.commit_interval; 844 osb->osb_commit_interval = parsed_options.commit_interval;
658 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 845 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
659 osb->local_alloc_bits = osb->local_alloc_default_bits; 846 osb->local_alloc_bits = osb->local_alloc_default_bits;
847 if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
848 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
849 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
850 status = -EINVAL;
851 mlog(ML_ERROR, "User quotas were requested, but this "
852 "filesystem does not have the feature enabled.\n");
853 goto read_super_error;
854 }
855 if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
856 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
857 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
858 status = -EINVAL;
859 mlog(ML_ERROR, "Group quotas were requested, but this "
860 "filesystem does not have the feature enabled.\n");
861 goto read_super_error;
862 }
660 863
661 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 864 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
662 if (status) 865 if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
664 867
665 sb->s_magic = OCFS2_SUPER_MAGIC; 868 sb->s_magic = OCFS2_SUPER_MAGIC;
666 869
870 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
871 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
872
667 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 873 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
668 * heartbeat=none */ 874 * heartbeat=none */
669 if (bdev_read_only(sb->s_bdev)) { 875 if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
758 atomic_set(&osb->vol_state, VOLUME_MOUNTED); 964 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
759 wake_up(&osb->osb_mount_event); 965 wake_up(&osb->osb_mount_event);
760 966
967 /* Now we can initialize quotas because we can afford to wait
968 * for cluster locks recovery now. That also means that truncation
969 * log recovery can happen but that waits for proper quota setup */
970 if (!(sb->s_flags & MS_RDONLY)) {
971 status = ocfs2_enable_quotas(osb);
972 if (status < 0) {
973 /* We have to err-out specially here because
974 * s_root is already set */
975 mlog_errno(status);
976 atomic_set(&osb->vol_state, VOLUME_DISABLED);
977 wake_up(&osb->osb_mount_event);
978 mlog_exit(status);
979 return status;
980 }
981 }
982
983 ocfs2_complete_quota_recovery(osb);
984
985 /* Now we wake up again for processes waiting for quotas */
986 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
987 wake_up(&osb->osb_mount_event);
988
761 mlog_exit(status); 989 mlog_exit(status);
762 return status; 990 return status;
763 991
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
945 case Opt_inode64: 1173 case Opt_inode64:
946 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1174 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
947 break; 1175 break;
1176 case Opt_usrquota:
1177 /* We check only on remount, otherwise features
1178 * aren't yet initialized. */
1179 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1180 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1181 mlog(ML_ERROR, "User quota requested but "
1182 "filesystem feature is not set\n");
1183 status = 0;
1184 goto bail;
1185 }
1186 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
1187 break;
1188 case Opt_grpquota:
1189 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1190 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1191 mlog(ML_ERROR, "Group quota requested but "
1192 "filesystem feature is not set\n");
1193 status = 0;
1194 goto bail;
1195 }
1196 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1197 break;
1198#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1199 case Opt_acl:
1200 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1201 break;
1202 case Opt_noacl:
1203 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1204 break;
1205#else
1206 case Opt_acl:
1207 case Opt_noacl:
1208 printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
1209 break;
1210#endif
948 default: 1211 default:
949 mlog(ML_ERROR, 1212 mlog(ML_ERROR,
950 "Unrecognized mount option \"%s\" " 1213 "Unrecognized mount option \"%s\" "
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1008 if (osb->osb_cluster_stack[0]) 1271 if (osb->osb_cluster_stack[0])
1009 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1272 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
1010 osb->osb_cluster_stack); 1273 osb->osb_cluster_stack);
1274 if (opts & OCFS2_MOUNT_USRQUOTA)
1275 seq_printf(s, ",usrquota");
1276 if (opts & OCFS2_MOUNT_GRPQUOTA)
1277 seq_printf(s, ",grpquota");
1011 1278
1012 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1279 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1013 seq_printf(s, ",nouser_xattr"); 1280 seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1017 if (opts & OCFS2_MOUNT_INODE64) 1284 if (opts & OCFS2_MOUNT_INODE64)
1018 seq_printf(s, ",inode64"); 1285 seq_printf(s, ",inode64");
1019 1286
1287#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1288 if (opts & OCFS2_MOUNT_POSIX_ACL)
1289 seq_printf(s, ",acl");
1290 else
1291 seq_printf(s, ",noacl");
1292#endif
1293
1020 return 0; 1294 return 0;
1021} 1295}
1022 1296
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
1052 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1326 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1053 } 1327 }
1054 1328
1329 status = ocfs2_quota_setup();
1330 if (status)
1331 goto leave;
1332
1055 ocfs2_set_locking_protocol(); 1333 ocfs2_set_locking_protocol();
1056 1334
1335 status = register_quota_format(&ocfs2_quota_format);
1057leave: 1336leave:
1058 if (status < 0) { 1337 if (status < 0) {
1338 ocfs2_quota_shutdown();
1059 ocfs2_free_mem_caches(); 1339 ocfs2_free_mem_caches();
1060 exit_ocfs2_uptodate_cache(); 1340 exit_ocfs2_uptodate_cache();
1061 } 1341 }
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
1072{ 1352{
1073 mlog_entry_void(); 1353 mlog_entry_void();
1074 1354
1355 ocfs2_quota_shutdown();
1356
1075 if (ocfs2_wq) { 1357 if (ocfs2_wq) {
1076 flush_workqueue(ocfs2_wq); 1358 flush_workqueue(ocfs2_wq);
1077 destroy_workqueue(ocfs2_wq); 1359 destroy_workqueue(ocfs2_wq);
1078 } 1360 }
1079 1361
1362 unregister_quota_format(&ocfs2_quota_format);
1363
1080 debugfs_remove(ocfs2_debugfs_root); 1364 debugfs_remove(ocfs2_debugfs_root);
1081 1365
1082 ocfs2_free_mem_caches(); 1366 ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
1192 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 1476 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1193 SLAB_MEM_SPREAD), 1477 SLAB_MEM_SPREAD),
1194 ocfs2_inode_init_once); 1478 ocfs2_inode_init_once);
1195 if (!ocfs2_inode_cachep) 1479 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
1480 sizeof(struct ocfs2_dquot),
1481 0,
1482 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1483 SLAB_MEM_SPREAD),
1484 NULL);
1485 ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
1486 sizeof(struct ocfs2_quota_chunk),
1487 0,
1488 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
1489 NULL);
1490 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
1491 !ocfs2_qf_chunk_cachep) {
1492 if (ocfs2_inode_cachep)
1493 kmem_cache_destroy(ocfs2_inode_cachep);
1494 if (ocfs2_dquot_cachep)
1495 kmem_cache_destroy(ocfs2_dquot_cachep);
1496 if (ocfs2_qf_chunk_cachep)
1497 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1196 return -ENOMEM; 1498 return -ENOMEM;
1499 }
1197 1500
1198 return 0; 1501 return 0;
1199} 1502}
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
1202{ 1505{
1203 if (ocfs2_inode_cachep) 1506 if (ocfs2_inode_cachep)
1204 kmem_cache_destroy(ocfs2_inode_cachep); 1507 kmem_cache_destroy(ocfs2_inode_cachep);
1205
1206 ocfs2_inode_cachep = NULL; 1508 ocfs2_inode_cachep = NULL;
1509
1510 if (ocfs2_dquot_cachep)
1511 kmem_cache_destroy(ocfs2_dquot_cachep);
1512 ocfs2_dquot_cachep = NULL;
1513
1514 if (ocfs2_qf_chunk_cachep)
1515 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1516 ocfs2_qf_chunk_cachep = NULL;
1207} 1517}
1208 1518
1209static int ocfs2_get_sector(struct super_block *sb, 1519static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1303 osb = OCFS2_SB(sb); 1613 osb = OCFS2_SB(sb);
1304 BUG_ON(!osb); 1614 BUG_ON(!osb);
1305 1615
1616 ocfs2_disable_quotas(osb);
1617
1306 ocfs2_shutdown_local_alloc(osb); 1618 ocfs2_shutdown_local_alloc(osb);
1307 1619
1308 ocfs2_truncate_log_shutdown(osb); 1620 ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
1413 sb->s_fs_info = osb; 1725 sb->s_fs_info = osb;
1414 sb->s_op = &ocfs2_sops; 1726 sb->s_op = &ocfs2_sops;
1415 sb->s_export_op = &ocfs2_export_ops; 1727 sb->s_export_op = &ocfs2_export_ops;
1728 sb->s_qcop = &ocfs2_quotactl_ops;
1729 sb->dq_op = &ocfs2_quota_operations;
1416 sb->s_xattr = ocfs2_xattr_handlers; 1730 sb->s_xattr = ocfs2_xattr_handlers;
1417 sb->s_time_gran = 1; 1731 sb->s_time_gran = 1;
1418 sb->s_flags |= MS_NOATIME; 1732 sb->s_flags |= MS_NOATIME;
@@ -1573,6 +1887,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
1573 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 1887 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
1574 journal->j_state = OCFS2_JOURNAL_FREE; 1888 journal->j_state = OCFS2_JOURNAL_FREE;
1575 1889
1890 INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
1891 osb->dentry_lock_list = NULL;
1892
1576 /* get some pseudo constants for clustersize bits */ 1893 /* get some pseudo constants for clustersize bits */
1577 osb->s_clustersize_bits = 1894 osb->s_clustersize_bits =
1578 le32_to_cpu(di->id2.i_super.s_clustersize_bits); 1895 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
@@ -1676,6 +1993,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1676 1993
1677 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, 1994 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
1678 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { 1995 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
1996 /* We have to do a raw check of the feature here */
1997 if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
1998 OCFS2_FEATURE_INCOMPAT_META_ECC) {
1999 status = ocfs2_block_check_validate(bh->b_data,
2000 bh->b_size,
2001 &di->i_check);
2002 if (status)
2003 goto out;
2004 }
1679 status = -EINVAL; 2005 status = -EINVAL;
1680 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { 2006 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
1681 mlog(ML_ERROR, "found superblock with incorrect block " 2007 mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2043,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1717 } 2043 }
1718 } 2044 }
1719 2045
2046out:
1720 mlog_exit(status); 2047 mlog_exit(status);
1721 return status; 2048 return status;
1722} 2049}
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b9..ed0a0cfd68d2 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
84 84
85 mlog_entry_void(); 85 mlog_entry_void();
86 86
87 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); 87 status = ocfs2_read_inode_block(inode, bh);
88 if (status < 0) { 88 if (status < 0) {
89 mlog_errno(status); 89 mlog_errno(status);
90 link = ERR_PTR(status); 90 link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade13..915039fffe6e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,12 +35,14 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/security.h>
38 39
39#define MLOG_MASK_PREFIX ML_XATTR 40#define MLOG_MASK_PREFIX ML_XATTR
40#include <cluster/masklog.h> 41#include <cluster/masklog.h>
41 42
42#include "ocfs2.h" 43#include "ocfs2.h"
43#include "alloc.h" 44#include "alloc.h"
45#include "blockcheck.h"
44#include "dlmglue.h" 46#include "dlmglue.h"
45#include "file.h" 47#include "file.h"
46#include "symlink.h" 48#include "symlink.h"
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root {
61}; 63};
62 64
63struct ocfs2_xattr_bucket { 65struct ocfs2_xattr_bucket {
64 struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; 66 /* The inode these xattrs are associated with */
65 struct ocfs2_xattr_header *xh; 67 struct inode *bu_inode;
68
69 /* The actual buffers that make up the bucket */
70 struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
71
72 /* How many blocks make up one bucket for this filesystem */
73 int bu_blocks;
74};
75
76struct ocfs2_xattr_set_ctxt {
77 handle_t *handle;
78 struct ocfs2_alloc_context *meta_ac;
79 struct ocfs2_alloc_context *data_ac;
80 struct ocfs2_cached_dealloc_ctxt dealloc;
66}; 81};
67 82
68#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 83#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
69#define OCFS2_XATTR_INLINE_SIZE 80 84#define OCFS2_XATTR_INLINE_SIZE 80
85#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \
86 - sizeof(struct ocfs2_xattr_header) \
87 - sizeof(__u32))
88#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \
89 - sizeof(struct ocfs2_xattr_block) \
90 - sizeof(struct ocfs2_xattr_header) \
91 - sizeof(__u32))
70 92
71static struct ocfs2_xattr_def_value_root def_xv = { 93static struct ocfs2_xattr_def_value_root def_xv = {
72 .xv.xr_list.l_count = cpu_to_le16(1), 94 .xv.xr_list.l_count = cpu_to_le16(1),
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = {
74 96
75struct xattr_handler *ocfs2_xattr_handlers[] = { 97struct xattr_handler *ocfs2_xattr_handlers[] = {
76 &ocfs2_xattr_user_handler, 98 &ocfs2_xattr_user_handler,
99#ifdef CONFIG_OCFS2_FS_POSIX_ACL
100 &ocfs2_xattr_acl_access_handler,
101 &ocfs2_xattr_acl_default_handler,
102#endif
77 &ocfs2_xattr_trusted_handler, 103 &ocfs2_xattr_trusted_handler,
104 &ocfs2_xattr_security_handler,
78 NULL 105 NULL
79}; 106};
80 107
81static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
82 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
110#ifdef CONFIG_OCFS2_FS_POSIX_ACL
111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
112 = &ocfs2_xattr_acl_access_handler,
113 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
114 = &ocfs2_xattr_acl_default_handler,
115#endif
83 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, 116 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
117 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
84}; 118};
85 119
86struct ocfs2_xattr_info { 120struct ocfs2_xattr_info {
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search {
98 */ 132 */
99 struct buffer_head *xattr_bh; 133 struct buffer_head *xattr_bh;
100 struct ocfs2_xattr_header *header; 134 struct ocfs2_xattr_header *header;
101 struct ocfs2_xattr_bucket bucket; 135 struct ocfs2_xattr_bucket *bucket;
102 void *base; 136 void *base;
103 void *end; 137 void *end;
104 struct ocfs2_xattr_entry *here; 138 struct ocfs2_xattr_entry *here;
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
127 size_t buffer_size); 161 size_t buffer_size);
128 162
129static int ocfs2_xattr_create_index_block(struct inode *inode, 163static int ocfs2_xattr_create_index_block(struct inode *inode,
130 struct ocfs2_xattr_search *xs); 164 struct ocfs2_xattr_search *xs,
165 struct ocfs2_xattr_set_ctxt *ctxt);
131 166
132static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 167static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
133 struct ocfs2_xattr_info *xi, 168 struct ocfs2_xattr_info *xi,
134 struct ocfs2_xattr_search *xs); 169 struct ocfs2_xattr_search *xs,
170 struct ocfs2_xattr_set_ctxt *ctxt);
135 171
136static int ocfs2_delete_xattr_index_block(struct inode *inode, 172static int ocfs2_delete_xattr_index_block(struct inode *inode,
137 struct buffer_head *xb_bh); 173 struct buffer_head *xb_bh);
174static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
175 u64 src_blk, u64 last_blk, u64 to_blk,
176 unsigned int start_bucket,
177 u32 *first_hash);
138 178
139static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 179static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
140{ 180{
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
154 return len / sizeof(struct ocfs2_xattr_entry); 194 return len / sizeof(struct ocfs2_xattr_entry);
155} 195}
156 196
197#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
198#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
199#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
200
201static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
202{
203 struct ocfs2_xattr_bucket *bucket;
204 int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
205
206 BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
207
208 bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
209 if (bucket) {
210 bucket->bu_inode = inode;
211 bucket->bu_blocks = blks;
212 }
213
214 return bucket;
215}
216
217static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
218{
219 int i;
220
221 for (i = 0; i < bucket->bu_blocks; i++) {
222 brelse(bucket->bu_bhs[i]);
223 bucket->bu_bhs[i] = NULL;
224 }
225}
226
227static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
228{
229 if (bucket) {
230 ocfs2_xattr_bucket_relse(bucket);
231 bucket->bu_inode = NULL;
232 kfree(bucket);
233 }
234}
235
236/*
237 * A bucket that has never been written to disk doesn't need to be
238 * read. We just need the buffer_heads. Don't call this for
239 * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes
240 * them fully.
241 */
242static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
243 u64 xb_blkno)
244{
245 int i, rc = 0;
246
247 for (i = 0; i < bucket->bu_blocks; i++) {
248 bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
249 xb_blkno + i);
250 if (!bucket->bu_bhs[i]) {
251 rc = -EIO;
252 mlog_errno(rc);
253 break;
254 }
255
256 if (!ocfs2_buffer_uptodate(bucket->bu_inode,
257 bucket->bu_bhs[i]))
258 ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
259 bucket->bu_bhs[i]);
260 }
261
262 if (rc)
263 ocfs2_xattr_bucket_relse(bucket);
264 return rc;
265}
266
267/* Read the xattr bucket at xb_blkno */
268static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
269 u64 xb_blkno)
270{
271 int rc;
272
273 rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
274 bucket->bu_blocks, bucket->bu_bhs, 0,
275 NULL);
276 if (!rc) {
277 rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
278 bucket->bu_bhs,
279 bucket->bu_blocks,
280 &bucket_xh(bucket)->xh_check);
281 if (rc)
282 mlog_errno(rc);
283 }
284
285 if (rc)
286 ocfs2_xattr_bucket_relse(bucket);
287 return rc;
288}
289
290static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
291 struct ocfs2_xattr_bucket *bucket,
292 int type)
293{
294 int i, rc = 0;
295
296 for (i = 0; i < bucket->bu_blocks; i++) {
297 rc = ocfs2_journal_access(handle, bucket->bu_inode,
298 bucket->bu_bhs[i], type);
299 if (rc) {
300 mlog_errno(rc);
301 break;
302 }
303 }
304
305 return rc;
306}
307
308static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
309 struct ocfs2_xattr_bucket *bucket)
310{
311 int i;
312
313 ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
314 bucket->bu_bhs, bucket->bu_blocks,
315 &bucket_xh(bucket)->xh_check);
316
317 for (i = 0; i < bucket->bu_blocks; i++)
318 ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
319}
320
321static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
322 struct ocfs2_xattr_bucket *src)
323{
324 int i;
325 int blocksize = src->bu_inode->i_sb->s_blocksize;
326
327 BUG_ON(dest->bu_blocks != src->bu_blocks);
328 BUG_ON(dest->bu_inode != src->bu_inode);
329
330 for (i = 0; i < src->bu_blocks; i++) {
331 memcpy(bucket_block(dest, i), bucket_block(src, i),
332 blocksize);
333 }
334}
335
336static int ocfs2_validate_xattr_block(struct super_block *sb,
337 struct buffer_head *bh)
338{
339 int rc;
340 struct ocfs2_xattr_block *xb =
341 (struct ocfs2_xattr_block *)bh->b_data;
342
343 mlog(0, "Validating xattr block %llu\n",
344 (unsigned long long)bh->b_blocknr);
345
346 BUG_ON(!buffer_uptodate(bh));
347
348 /*
349 * If the ecc fails, we return the error but otherwise
350 * leave the filesystem running. We know any error is
351 * local to this block.
352 */
353 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
354 if (rc)
355 return rc;
356
357 /*
358 * Errors after here are fatal
359 */
360
361 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
362 ocfs2_error(sb,
363 "Extended attribute block #%llu has bad "
364 "signature %.*s",
365 (unsigned long long)bh->b_blocknr, 7,
366 xb->xb_signature);
367 return -EINVAL;
368 }
369
370 if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
371 ocfs2_error(sb,
372 "Extended attribute block #%llu has an "
373 "invalid xb_blkno of %llu",
374 (unsigned long long)bh->b_blocknr,
375 (unsigned long long)le64_to_cpu(xb->xb_blkno));
376 return -EINVAL;
377 }
378
379 if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
380 ocfs2_error(sb,
381 "Extended attribute block #%llu has an invalid "
382 "xb_fs_generation of #%u",
383 (unsigned long long)bh->b_blocknr,
384 le32_to_cpu(xb->xb_fs_generation));
385 return -EINVAL;
386 }
387
388 return 0;
389}
390
391static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
392 struct buffer_head **bh)
393{
394 int rc;
395 struct buffer_head *tmp = *bh;
396
397 rc = ocfs2_read_block(inode, xb_blkno, &tmp,
398 ocfs2_validate_xattr_block);
399
400 /* If ocfs2_read_block() got us a new bh, pass it up. */
401 if (!rc && !*bh)
402 *bh = tmp;
403
404 return rc;
405}
406
157static inline const char *ocfs2_xattr_prefix(int name_index) 407static inline const char *ocfs2_xattr_prefix(int name_index)
158{ 408{
159 struct xattr_handler *handler = NULL; 409 struct xattr_handler *handler = NULL;
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
200 return; 450 return;
201} 451}
202 452
453static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
454{
455 int size = 0;
456
457 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
458 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
459 else
460 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
461 size += sizeof(struct ocfs2_xattr_entry);
462
463 return size;
464}
465
466int ocfs2_calc_security_init(struct inode *dir,
467 struct ocfs2_security_xattr_info *si,
468 int *want_clusters,
469 int *xattr_credits,
470 struct ocfs2_alloc_context **xattr_ac)
471{
472 int ret = 0;
473 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
474 int s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
475 si->value_len);
476
477 /*
478 * The max space of security xattr taken inline is
479 * 256(name) + 80(value) + 16(entry) = 352 bytes,
480 * So reserve one metadata block for it is ok.
481 */
482 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
483 s_size > OCFS2_XATTR_FREE_IN_IBODY) {
484 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
485 if (ret) {
486 mlog_errno(ret);
487 return ret;
488 }
489 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
490 }
491
492 /* reserve clusters for xattr value which will be set in B tree*/
493 if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
494 int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
495 si->value_len);
496
497 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
498 new_clusters);
499 *want_clusters += new_clusters;
500 }
501 return ret;
502}
503
504int ocfs2_calc_xattr_init(struct inode *dir,
505 struct buffer_head *dir_bh,
506 int mode,
507 struct ocfs2_security_xattr_info *si,
508 int *want_clusters,
509 int *xattr_credits,
510 struct ocfs2_alloc_context **xattr_ac)
511{
512 int ret = 0;
513 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
514 int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
515
516 if (si->enable)
517 s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
518 si->value_len);
519
520 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
521 acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
522 OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
523 "", NULL, 0);
524 if (acl_len > 0) {
525 a_size = ocfs2_xattr_entry_real_size(0, acl_len);
526 if (S_ISDIR(mode))
527 a_size <<= 1;
528 } else if (acl_len != 0 && acl_len != -ENODATA) {
529 mlog_errno(ret);
530 return ret;
531 }
532 }
533
534 if (!(s_size + a_size))
535 return ret;
536
537 /*
538 * The max space of security xattr taken inline is
539 * 256(name) + 80(value) + 16(entry) = 352 bytes,
540 * The max space of acl xattr taken inline is
541 * 80(value) + 16(entry) * 2(if directory) = 192 bytes,
542 * when blocksize = 512, may reserve one more cluser for
543 * xattr bucket, otherwise reserve one metadata block
544 * for them is ok.
545 */
546 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
547 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
548 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
549 if (ret) {
550 mlog_errno(ret);
551 return ret;
552 }
553 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
554 }
555
556 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
557 (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
558 *want_clusters += 1;
559 *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
560 }
561
562 /*
563 * reserve credits and clusters for xattrs which has large value
564 * and have to be set outside
565 */
566 if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
567 new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
568 si->value_len);
569 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
570 new_clusters);
571 *want_clusters += new_clusters;
572 }
573 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
574 acl_len > OCFS2_XATTR_INLINE_SIZE) {
575 /* for directory, it has DEFAULT and ACCESS two types of acls */
576 new_clusters = (S_ISDIR(mode) ? 2 : 1) *
577 ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
578 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
579 new_clusters);
580 *want_clusters += new_clusters;
581 }
582
583 return ret;
584}
585
203static int ocfs2_xattr_extend_allocation(struct inode *inode, 586static int ocfs2_xattr_extend_allocation(struct inode *inode,
204 u32 clusters_to_add, 587 u32 clusters_to_add,
205 struct buffer_head *xattr_bh, 588 struct ocfs2_xattr_value_buf *vb,
206 struct ocfs2_xattr_value_root *xv) 589 struct ocfs2_xattr_set_ctxt *ctxt)
207{ 590{
208 int status = 0; 591 int status = 0;
209 int restart_func = 0; 592 handle_t *handle = ctxt->handle;
210 int credits = 0;
211 handle_t *handle = NULL;
212 struct ocfs2_alloc_context *data_ac = NULL;
213 struct ocfs2_alloc_context *meta_ac = NULL;
214 enum ocfs2_alloc_restarted why; 593 enum ocfs2_alloc_restarted why;
215 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 594 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
216 u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); 595 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
217 struct ocfs2_extent_tree et; 596 struct ocfs2_extent_tree et;
218 597
219 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); 598 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
220 599
221 ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); 600 ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
222
223restart_all:
224
225 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
226 &data_ac, &meta_ac);
227 if (status) {
228 mlog_errno(status);
229 goto leave;
230 }
231
232 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
233 clusters_to_add);
234 handle = ocfs2_start_trans(osb, credits);
235 if (IS_ERR(handle)) {
236 status = PTR_ERR(handle);
237 handle = NULL;
238 mlog_errno(status);
239 goto leave;
240 }
241 601
242restarted_transaction: 602 status = vb->vb_access(handle, inode, vb->vb_bh,
243 status = ocfs2_journal_access(handle, inode, xattr_bh, 603 OCFS2_JOURNAL_ACCESS_WRITE);
244 OCFS2_JOURNAL_ACCESS_WRITE);
245 if (status < 0) { 604 if (status < 0) {
246 mlog_errno(status); 605 mlog_errno(status);
247 goto leave; 606 goto leave;
248 } 607 }
249 608
250 prev_clusters = le32_to_cpu(xv->xr_clusters); 609 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
251 status = ocfs2_add_clusters_in_btree(osb, 610 status = ocfs2_add_clusters_in_btree(osb,
252 inode, 611 inode,
253 &logical_start, 612 &logical_start,
@@ -255,157 +614,84 @@ restarted_transaction:
255 0, 614 0,
256 &et, 615 &et,
257 handle, 616 handle,
258 data_ac, 617 ctxt->data_ac,
259 meta_ac, 618 ctxt->meta_ac,
260 &why); 619 &why);
261 if ((status < 0) && (status != -EAGAIN)) { 620 if (status < 0) {
262 if (status != -ENOSPC) 621 mlog_errno(status);
263 mlog_errno(status);
264 goto leave; 622 goto leave;
265 } 623 }
266 624
267 status = ocfs2_journal_dirty(handle, xattr_bh); 625 status = ocfs2_journal_dirty(handle, vb->vb_bh);
268 if (status < 0) { 626 if (status < 0) {
269 mlog_errno(status); 627 mlog_errno(status);
270 goto leave; 628 goto leave;
271 } 629 }
272 630
273 clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; 631 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
274 632
275 if (why != RESTART_NONE && clusters_to_add) { 633 /*
276 if (why == RESTART_META) { 634 * We should have already allocated enough space before the transaction,
277 mlog(0, "restarting function.\n"); 635 * so no need to restart.
278 restart_func = 1; 636 */
279 } else { 637 BUG_ON(why != RESTART_NONE || clusters_to_add);
280 BUG_ON(why != RESTART_TRANS);
281
282 mlog(0, "restarting transaction.\n");
283 /* TODO: This can be more intelligent. */
284 credits = ocfs2_calc_extend_credits(osb->sb,
285 et.et_root_el,
286 clusters_to_add);
287 status = ocfs2_extend_trans(handle, credits);
288 if (status < 0) {
289 /* handle still has to be committed at
290 * this point. */
291 status = -ENOMEM;
292 mlog_errno(status);
293 goto leave;
294 }
295 goto restarted_transaction;
296 }
297 }
298 638
299leave: 639leave:
300 if (handle) {
301 ocfs2_commit_trans(osb, handle);
302 handle = NULL;
303 }
304 if (data_ac) {
305 ocfs2_free_alloc_context(data_ac);
306 data_ac = NULL;
307 }
308 if (meta_ac) {
309 ocfs2_free_alloc_context(meta_ac);
310 meta_ac = NULL;
311 }
312 if ((!status) && restart_func) {
313 restart_func = 0;
314 goto restart_all;
315 }
316 640
317 return status; 641 return status;
318} 642}
319 643
320static int __ocfs2_remove_xattr_range(struct inode *inode, 644static int __ocfs2_remove_xattr_range(struct inode *inode,
321 struct buffer_head *root_bh, 645 struct ocfs2_xattr_value_buf *vb,
322 struct ocfs2_xattr_value_root *xv,
323 u32 cpos, u32 phys_cpos, u32 len, 646 u32 cpos, u32 phys_cpos, u32 len,
324 struct ocfs2_cached_dealloc_ctxt *dealloc) 647 struct ocfs2_xattr_set_ctxt *ctxt)
325{ 648{
326 int ret; 649 int ret;
327 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
328 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 651 handle_t *handle = ctxt->handle;
329 struct inode *tl_inode = osb->osb_tl_inode;
330 handle_t *handle;
331 struct ocfs2_alloc_context *meta_ac = NULL;
332 struct ocfs2_extent_tree et; 652 struct ocfs2_extent_tree et;
333 653
334 ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); 654 ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
335 655
336 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); 656 ret = vb->vb_access(handle, inode, vb->vb_bh,
657 OCFS2_JOURNAL_ACCESS_WRITE);
337 if (ret) { 658 if (ret) {
338 mlog_errno(ret); 659 mlog_errno(ret);
339 return ret;
340 }
341
342 mutex_lock(&tl_inode->i_mutex);
343
344 if (ocfs2_truncate_log_needs_flush(osb)) {
345 ret = __ocfs2_flush_truncate_log(osb);
346 if (ret < 0) {
347 mlog_errno(ret);
348 goto out;
349 }
350 }
351
352 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
353 if (IS_ERR(handle)) {
354 ret = PTR_ERR(handle);
355 mlog_errno(ret);
356 goto out; 660 goto out;
357 } 661 }
358 662
359 ret = ocfs2_journal_access(handle, inode, root_bh, 663 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
360 OCFS2_JOURNAL_ACCESS_WRITE); 664 &ctxt->dealloc);
361 if (ret) {
362 mlog_errno(ret);
363 goto out_commit;
364 }
365
366 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
367 dealloc);
368 if (ret) { 665 if (ret) {
369 mlog_errno(ret); 666 mlog_errno(ret);
370 goto out_commit; 667 goto out;
371 } 668 }
372 669
373 le32_add_cpu(&xv->xr_clusters, -len); 670 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
374 671
375 ret = ocfs2_journal_dirty(handle, root_bh); 672 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
376 if (ret) { 673 if (ret) {
377 mlog_errno(ret); 674 mlog_errno(ret);
378 goto out_commit; 675 goto out;
379 } 676 }
380 677
381 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 678 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
382 if (ret) 679 if (ret)
383 mlog_errno(ret); 680 mlog_errno(ret);
384 681
385out_commit:
386 ocfs2_commit_trans(osb, handle);
387out: 682out:
388 mutex_unlock(&tl_inode->i_mutex);
389
390 if (meta_ac)
391 ocfs2_free_alloc_context(meta_ac);
392
393 return ret; 683 return ret;
394} 684}
395 685
396static int ocfs2_xattr_shrink_size(struct inode *inode, 686static int ocfs2_xattr_shrink_size(struct inode *inode,
397 u32 old_clusters, 687 u32 old_clusters,
398 u32 new_clusters, 688 u32 new_clusters,
399 struct buffer_head *root_bh, 689 struct ocfs2_xattr_value_buf *vb,
400 struct ocfs2_xattr_value_root *xv) 690 struct ocfs2_xattr_set_ctxt *ctxt)
401{ 691{
402 int ret = 0; 692 int ret = 0;
403 u32 trunc_len, cpos, phys_cpos, alloc_size; 693 u32 trunc_len, cpos, phys_cpos, alloc_size;
404 u64 block; 694 u64 block;
405 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
406 struct ocfs2_cached_dealloc_ctxt dealloc;
407
408 ocfs2_init_dealloc_ctxt(&dealloc);
409 695
410 if (old_clusters <= new_clusters) 696 if (old_clusters <= new_clusters)
411 return 0; 697 return 0;
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
414 trunc_len = old_clusters - new_clusters; 700 trunc_len = old_clusters - new_clusters;
415 while (trunc_len) { 701 while (trunc_len) {
416 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, 702 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
417 &alloc_size, &xv->xr_list); 703 &alloc_size,
704 &vb->vb_xv->xr_list);
418 if (ret) { 705 if (ret) {
419 mlog_errno(ret); 706 mlog_errno(ret);
420 goto out; 707 goto out;
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
423 if (alloc_size > trunc_len) 710 if (alloc_size > trunc_len)
424 alloc_size = trunc_len; 711 alloc_size = trunc_len;
425 712
426 ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, 713 ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
427 phys_cpos, alloc_size, 714 phys_cpos, alloc_size,
428 &dealloc); 715 ctxt);
429 if (ret) { 716 if (ret) {
430 mlog_errno(ret); 717 mlog_errno(ret);
431 goto out; 718 goto out;
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
439 } 726 }
440 727
441out: 728out:
442 ocfs2_schedule_truncate_log_flush(osb, 1);
443 ocfs2_run_deallocs(osb, &dealloc);
444
445 return ret; 729 return ret;
446} 730}
447 731
448static int ocfs2_xattr_value_truncate(struct inode *inode, 732static int ocfs2_xattr_value_truncate(struct inode *inode,
449 struct buffer_head *root_bh, 733 struct ocfs2_xattr_value_buf *vb,
450 struct ocfs2_xattr_value_root *xv, 734 int len,
451 int len) 735 struct ocfs2_xattr_set_ctxt *ctxt)
452{ 736{
453 int ret; 737 int ret;
454 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); 738 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
455 u32 old_clusters = le32_to_cpu(xv->xr_clusters); 739 u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
456 740
457 if (new_clusters == old_clusters) 741 if (new_clusters == old_clusters)
458 return 0; 742 return 0;
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
460 if (new_clusters > old_clusters) 744 if (new_clusters > old_clusters)
461 ret = ocfs2_xattr_extend_allocation(inode, 745 ret = ocfs2_xattr_extend_allocation(inode,
462 new_clusters - old_clusters, 746 new_clusters - old_clusters,
463 root_bh, xv); 747 vb, ctxt);
464 else 748 else
465 ret = ocfs2_xattr_shrink_size(inode, 749 ret = ocfs2_xattr_shrink_size(inode,
466 old_clusters, new_clusters, 750 old_clusters, new_clusters,
467 root_bh, xv); 751 vb, ctxt);
468 752
469 return ret; 753 return ret;
470} 754}
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
554 if (!di->i_xattr_loc) 838 if (!di->i_xattr_loc)
555 return ret; 839 return ret;
556 840
557 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); 841 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
842 &blk_bh);
558 if (ret < 0) { 843 if (ret < 0) {
559 mlog_errno(ret); 844 mlog_errno(ret);
560 return ret; 845 return ret;
561 } 846 }
562 847
563 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 848 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
564 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
565 ret = -EIO;
566 goto cleanup;
567 }
568
569 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 849 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
570 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; 850 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
571 ret = ocfs2_xattr_list_entries(inode, header, 851 ret = ocfs2_xattr_list_entries(inode, header,
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
575 ret = ocfs2_xattr_tree_list_index_block(inode, xt, 855 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
576 buffer, buffer_size); 856 buffer, buffer_size);
577 } 857 }
578cleanup: 858
579 brelse(blk_bh); 859 brelse(blk_bh);
580 860
581 return ret; 861 return ret;
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
685 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 965 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
686 /* Copy ocfs2_xattr_value */ 966 /* Copy ocfs2_xattr_value */
687 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 967 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
688 ret = ocfs2_read_block(inode, blkno, &bh); 968 ret = ocfs2_read_block(inode, blkno, &bh, NULL);
689 if (ret) { 969 if (ret) {
690 mlog_errno(ret); 970 mlog_errno(ret);
691 goto out; 971 goto out;
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
769 size_t size; 1049 size_t size;
770 int ret = -ENODATA, name_offset, name_len, block_off, i; 1050 int ret = -ENODATA, name_offset, name_len, block_off, i;
771 1051
772 memset(&xs->bucket, 0, sizeof(xs->bucket)); 1052 xs->bucket = ocfs2_xattr_bucket_new(inode);
1053 if (!xs->bucket) {
1054 ret = -ENOMEM;
1055 mlog_errno(ret);
1056 goto cleanup;
1057 }
773 1058
774 ret = ocfs2_xattr_block_find(inode, name_index, name, xs); 1059 ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
775 if (ret) { 1060 if (ret) {
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
795 1080
796 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 1081 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
797 ret = ocfs2_xattr_bucket_get_name_value(inode, 1082 ret = ocfs2_xattr_bucket_get_name_value(inode,
798 xs->bucket.xh, 1083 bucket_xh(xs->bucket),
799 i, 1084 i,
800 &block_off, 1085 &block_off,
801 &name_offset); 1086 &name_offset);
802 xs->base = xs->bucket.bhs[block_off]->b_data; 1087 xs->base = bucket_block(xs->bucket, block_off);
803 } 1088 }
804 if (ocfs2_xattr_is_local(xs->here)) { 1089 if (ocfs2_xattr_is_local(xs->here)) {
805 memcpy(buffer, (void *)xs->base + 1090 memcpy(buffer, (void *)xs->base +
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
817 } 1102 }
818 ret = size; 1103 ret = size;
819cleanup: 1104cleanup:
820 for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) 1105 ocfs2_xattr_bucket_free(xs->bucket);
821 brelse(xs->bucket.bhs[i]);
822 memset(&xs->bucket, 0, sizeof(xs->bucket));
823 1106
824 brelse(xs->xattr_bh); 1107 brelse(xs->xattr_bh);
825 xs->xattr_bh = NULL; 1108 xs->xattr_bh = NULL;
826 return ret; 1109 return ret;
827} 1110}
828 1111
829/* ocfs2_xattr_get() 1112int ocfs2_xattr_get_nolock(struct inode *inode,
830 * 1113 struct buffer_head *di_bh,
831 * Copy an extended attribute into the buffer provided.
832 * Buffer is NULL to compute the size of buffer required.
833 */
834static int ocfs2_xattr_get(struct inode *inode,
835 int name_index, 1114 int name_index,
836 const char *name, 1115 const char *name,
837 void *buffer, 1116 void *buffer,
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode,
839{ 1118{
840 int ret; 1119 int ret;
841 struct ocfs2_dinode *di = NULL; 1120 struct ocfs2_dinode *di = NULL;
842 struct buffer_head *di_bh = NULL;
843 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1121 struct ocfs2_inode_info *oi = OCFS2_I(inode);
844 struct ocfs2_xattr_search xis = { 1122 struct ocfs2_xattr_search xis = {
845 .not_found = -ENODATA, 1123 .not_found = -ENODATA,
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode,
854 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) 1132 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
855 ret = -ENODATA; 1133 ret = -ENODATA;
856 1134
857 ret = ocfs2_inode_lock(inode, &di_bh, 0);
858 if (ret < 0) {
859 mlog_errno(ret);
860 return ret;
861 }
862 xis.inode_bh = xbs.inode_bh = di_bh; 1135 xis.inode_bh = xbs.inode_bh = di_bh;
863 di = (struct ocfs2_dinode *)di_bh->b_data; 1136 di = (struct ocfs2_dinode *)di_bh->b_data;
864 1137
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode,
869 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, 1142 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
870 buffer_size, &xbs); 1143 buffer_size, &xbs);
871 up_read(&oi->ip_xattr_sem); 1144 up_read(&oi->ip_xattr_sem);
1145
1146 return ret;
1147}
1148
1149/* ocfs2_xattr_get()
1150 *
1151 * Copy an extended attribute into the buffer provided.
1152 * Buffer is NULL to compute the size of buffer required.
1153 */
1154static int ocfs2_xattr_get(struct inode *inode,
1155 int name_index,
1156 const char *name,
1157 void *buffer,
1158 size_t buffer_size)
1159{
1160 int ret;
1161 struct buffer_head *di_bh = NULL;
1162
1163 ret = ocfs2_inode_lock(inode, &di_bh, 0);
1164 if (ret < 0) {
1165 mlog_errno(ret);
1166 return ret;
1167 }
1168 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
1169 name, buffer, buffer_size);
1170
872 ocfs2_inode_unlock(inode, 0); 1171 ocfs2_inode_unlock(inode, 0);
873 1172
874 brelse(di_bh); 1173 brelse(di_bh);
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode,
877} 1176}
878 1177
879static int __ocfs2_xattr_set_value_outside(struct inode *inode, 1178static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1179 handle_t *handle,
880 struct ocfs2_xattr_value_root *xv, 1180 struct ocfs2_xattr_value_root *xv,
881 const void *value, 1181 const void *value,
882 int value_len) 1182 int value_len)
883{ 1183{
884 int ret = 0, i, cp_len, credits; 1184 int ret = 0, i, cp_len;
885 u16 blocksize = inode->i_sb->s_blocksize; 1185 u16 blocksize = inode->i_sb->s_blocksize;
886 u32 p_cluster, num_clusters; 1186 u32 p_cluster, num_clusters;
887 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 1187 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
888 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); 1188 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
889 u64 blkno; 1189 u64 blkno;
890 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
891 handle_t *handle;
892 1191
893 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); 1192 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
894 1193
895 credits = clusters * bpc;
896 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
897 if (IS_ERR(handle)) {
898 ret = PTR_ERR(handle);
899 mlog_errno(ret);
900 goto out;
901 }
902
903 while (cpos < clusters) { 1194 while (cpos < clusters) {
904 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1195 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
905 &num_clusters, &xv->xr_list); 1196 &num_clusters, &xv->xr_list);
906 if (ret) { 1197 if (ret) {
907 mlog_errno(ret); 1198 mlog_errno(ret);
908 goto out_commit; 1199 goto out;
909 } 1200 }
910 1201
911 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1202 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
912 1203
913 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1204 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
914 ret = ocfs2_read_block(inode, blkno, &bh); 1205 ret = ocfs2_read_block(inode, blkno, &bh, NULL);
915 if (ret) { 1206 if (ret) {
916 mlog_errno(ret); 1207 mlog_errno(ret);
917 goto out_commit; 1208 goto out;
918 } 1209 }
919 1210
920 ret = ocfs2_journal_access(handle, 1211 ret = ocfs2_journal_access(handle,
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
923 OCFS2_JOURNAL_ACCESS_WRITE); 1214 OCFS2_JOURNAL_ACCESS_WRITE);
924 if (ret < 0) { 1215 if (ret < 0) {
925 mlog_errno(ret); 1216 mlog_errno(ret);
926 goto out_commit; 1217 goto out;
927 } 1218 }
928 1219
929 cp_len = value_len > blocksize ? blocksize : value_len; 1220 cp_len = value_len > blocksize ? blocksize : value_len;
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
937 ret = ocfs2_journal_dirty(handle, bh); 1228 ret = ocfs2_journal_dirty(handle, bh);
938 if (ret < 0) { 1229 if (ret < 0) {
939 mlog_errno(ret); 1230 mlog_errno(ret);
940 goto out_commit; 1231 goto out;
941 } 1232 }
942 brelse(bh); 1233 brelse(bh);
943 bh = NULL; 1234 bh = NULL;
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
951 } 1242 }
952 cpos += num_clusters; 1243 cpos += num_clusters;
953 } 1244 }
954out_commit:
955 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
956out: 1245out:
957 brelse(bh); 1246 brelse(bh);
958 1247
@@ -960,28 +1249,22 @@ out:
960} 1249}
961 1250
962static int ocfs2_xattr_cleanup(struct inode *inode, 1251static int ocfs2_xattr_cleanup(struct inode *inode,
1252 handle_t *handle,
963 struct ocfs2_xattr_info *xi, 1253 struct ocfs2_xattr_info *xi,
964 struct ocfs2_xattr_search *xs, 1254 struct ocfs2_xattr_search *xs,
1255 struct ocfs2_xattr_value_buf *vb,
965 size_t offs) 1256 size_t offs)
966{ 1257{
967 handle_t *handle = NULL;
968 int ret = 0; 1258 int ret = 0;
969 size_t name_len = strlen(xi->name); 1259 size_t name_len = strlen(xi->name);
970 void *val = xs->base + offs; 1260 void *val = xs->base + offs;
971 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 1261 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
972 1262
973 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1263 ret = vb->vb_access(handle, inode, vb->vb_bh,
974 OCFS2_XATTR_BLOCK_UPDATE_CREDITS); 1264 OCFS2_JOURNAL_ACCESS_WRITE);
975 if (IS_ERR(handle)) {
976 ret = PTR_ERR(handle);
977 mlog_errno(ret);
978 goto out;
979 }
980 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
981 OCFS2_JOURNAL_ACCESS_WRITE);
982 if (ret) { 1265 if (ret) {
983 mlog_errno(ret); 1266 mlog_errno(ret);
984 goto out_commit; 1267 goto out;
985 } 1268 }
986 /* Decrease xattr count */ 1269 /* Decrease xattr count */
987 le16_add_cpu(&xs->header->xh_count, -1); 1270 le16_add_cpu(&xs->header->xh_count, -1);
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
989 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); 1272 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
990 memset(val, 0, size); 1273 memset(val, 0, size);
991 1274
992 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1275 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
993 if (ret < 0) 1276 if (ret < 0)
994 mlog_errno(ret); 1277 mlog_errno(ret);
995out_commit:
996 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
997out: 1278out:
998 return ret; 1279 return ret;
999} 1280}
1000 1281
1001static int ocfs2_xattr_update_entry(struct inode *inode, 1282static int ocfs2_xattr_update_entry(struct inode *inode,
1283 handle_t *handle,
1002 struct ocfs2_xattr_info *xi, 1284 struct ocfs2_xattr_info *xi,
1003 struct ocfs2_xattr_search *xs, 1285 struct ocfs2_xattr_search *xs,
1286 struct ocfs2_xattr_value_buf *vb,
1004 size_t offs) 1287 size_t offs)
1005{ 1288{
1006 handle_t *handle = NULL; 1289 int ret;
1007 int ret = 0;
1008 1290
1009 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1291 ret = vb->vb_access(handle, inode, vb->vb_bh,
1010 OCFS2_XATTR_BLOCK_UPDATE_CREDITS); 1292 OCFS2_JOURNAL_ACCESS_WRITE);
1011 if (IS_ERR(handle)) {
1012 ret = PTR_ERR(handle);
1013 mlog_errno(ret);
1014 goto out;
1015 }
1016 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1017 OCFS2_JOURNAL_ACCESS_WRITE);
1018 if (ret) { 1293 if (ret) {
1019 mlog_errno(ret); 1294 mlog_errno(ret);
1020 goto out_commit; 1295 goto out;
1021 } 1296 }
1022 1297
1023 xs->here->xe_name_offset = cpu_to_le16(offs); 1298 xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
1028 ocfs2_xattr_set_local(xs->here, 0); 1303 ocfs2_xattr_set_local(xs->here, 0);
1029 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1304 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1030 1305
1031 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1306 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
1032 if (ret < 0) 1307 if (ret < 0)
1033 mlog_errno(ret); 1308 mlog_errno(ret);
1034out_commit:
1035 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1036out: 1309out:
1037 return ret; 1310 return ret;
1038} 1311}
@@ -1045,6 +1318,8 @@ out:
1045static int ocfs2_xattr_set_value_outside(struct inode *inode, 1318static int ocfs2_xattr_set_value_outside(struct inode *inode,
1046 struct ocfs2_xattr_info *xi, 1319 struct ocfs2_xattr_info *xi,
1047 struct ocfs2_xattr_search *xs, 1320 struct ocfs2_xattr_search *xs,
1321 struct ocfs2_xattr_set_ctxt *ctxt,
1322 struct ocfs2_xattr_value_buf *vb,
1048 size_t offs) 1323 size_t offs)
1049{ 1324{
1050 size_t name_len = strlen(xi->name); 1325 size_t name_len = strlen(xi->name);
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
1062 xv->xr_list.l_tree_depth = 0; 1337 xv->xr_list.l_tree_depth = 0;
1063 xv->xr_list.l_count = cpu_to_le16(1); 1338 xv->xr_list.l_count = cpu_to_le16(1);
1064 xv->xr_list.l_next_free_rec = 0; 1339 xv->xr_list.l_next_free_rec = 0;
1340 vb->vb_xv = xv;
1065 1341
1066 ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, 1342 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
1067 xi->value_len);
1068 if (ret < 0) { 1343 if (ret < 0) {
1069 mlog_errno(ret); 1344 mlog_errno(ret);
1070 return ret; 1345 return ret;
1071 } 1346 }
1072 ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, 1347 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
1073 xi->value_len);
1074 if (ret < 0) { 1348 if (ret < 0) {
1075 mlog_errno(ret); 1349 mlog_errno(ret);
1076 return ret; 1350 return ret;
1077 } 1351 }
1078 ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); 1352 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
1353 xi->value, xi->value_len);
1079 if (ret < 0) 1354 if (ret < 0)
1080 mlog_errno(ret); 1355 mlog_errno(ret);
1081 1356
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
1195static int ocfs2_xattr_set_entry(struct inode *inode, 1470static int ocfs2_xattr_set_entry(struct inode *inode,
1196 struct ocfs2_xattr_info *xi, 1471 struct ocfs2_xattr_info *xi,
1197 struct ocfs2_xattr_search *xs, 1472 struct ocfs2_xattr_search *xs,
1473 struct ocfs2_xattr_set_ctxt *ctxt,
1198 int flag) 1474 int flag)
1199{ 1475{
1200 struct ocfs2_xattr_entry *last; 1476 struct ocfs2_xattr_entry *last;
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1202 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1478 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1203 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1479 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
1204 size_t size_l = 0; 1480 size_t size_l = 0;
1205 handle_t *handle = NULL; 1481 handle_t *handle = ctxt->handle;
1206 int free, i, ret; 1482 int free, i, ret;
1207 struct ocfs2_xattr_info xi_l = { 1483 struct ocfs2_xattr_info xi_l = {
1208 .name_index = xi->name_index, 1484 .name_index = xi->name_index,
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1210 .value = xi->value, 1486 .value = xi->value,
1211 .value_len = xi->value_len, 1487 .value_len = xi->value_len,
1212 }; 1488 };
1489 struct ocfs2_xattr_value_buf vb = {
1490 .vb_bh = xs->xattr_bh,
1491 .vb_access = ocfs2_journal_access_di,
1492 };
1493
1494 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1495 BUG_ON(xs->xattr_bh == xs->inode_bh);
1496 vb.vb_access = ocfs2_journal_access_xb;
1497 } else
1498 BUG_ON(xs->xattr_bh != xs->inode_bh);
1213 1499
1214 /* Compute min_offs, last and free space. */ 1500 /* Compute min_offs, last and free space. */
1215 last = xs->header->xh_entries; 1501 last = xs->header->xh_entries;
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1265 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 1551 if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
1266 /* Replace existing local xattr with tree root */ 1552 /* Replace existing local xattr with tree root */
1267 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 1553 ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
1268 offs); 1554 ctxt, &vb, offs);
1269 if (ret < 0) 1555 if (ret < 0)
1270 mlog_errno(ret); 1556 mlog_errno(ret);
1271 goto out; 1557 goto out;
1272 } else if (!ocfs2_xattr_is_local(xs->here)) { 1558 } else if (!ocfs2_xattr_is_local(xs->here)) {
1273 /* For existing xattr which has value outside */ 1559 /* For existing xattr which has value outside */
1274 struct ocfs2_xattr_value_root *xv = NULL; 1560 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1275 xv = (struct ocfs2_xattr_value_root *)(val + 1561 (val + OCFS2_XATTR_SIZE(name_len));
1276 OCFS2_XATTR_SIZE(name_len));
1277 1562
1278 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 1563 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1279 /* 1564 /*
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1282 * then set new value with set_value_outside(). 1567 * then set new value with set_value_outside().
1283 */ 1568 */
1284 ret = ocfs2_xattr_value_truncate(inode, 1569 ret = ocfs2_xattr_value_truncate(inode,
1285 xs->xattr_bh, 1570 &vb,
1286 xv, 1571 xi->value_len,
1287 xi->value_len); 1572 ctxt);
1288 if (ret < 0) { 1573 if (ret < 0) {
1289 mlog_errno(ret); 1574 mlog_errno(ret);
1290 goto out; 1575 goto out;
1291 } 1576 }
1292 1577
1293 ret = __ocfs2_xattr_set_value_outside(inode, 1578 ret = ocfs2_xattr_update_entry(inode,
1294 xv, 1579 handle,
1295 xi->value, 1580 xi,
1296 xi->value_len); 1581 xs,
1582 &vb,
1583 offs);
1297 if (ret < 0) { 1584 if (ret < 0) {
1298 mlog_errno(ret); 1585 mlog_errno(ret);
1299 goto out; 1586 goto out;
1300 } 1587 }
1301 1588
1302 ret = ocfs2_xattr_update_entry(inode, 1589 ret = __ocfs2_xattr_set_value_outside(inode,
1303 xi, 1590 handle,
1304 xs, 1591 vb.vb_xv,
1305 offs); 1592 xi->value,
1593 xi->value_len);
1306 if (ret < 0) 1594 if (ret < 0)
1307 mlog_errno(ret); 1595 mlog_errno(ret);
1308 goto out; 1596 goto out;
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1312 * just trucate old value to zero. 1600 * just trucate old value to zero.
1313 */ 1601 */
1314 ret = ocfs2_xattr_value_truncate(inode, 1602 ret = ocfs2_xattr_value_truncate(inode,
1315 xs->xattr_bh, 1603 &vb,
1316 xv, 1604 0,
1317 0); 1605 ctxt);
1318 if (ret < 0) 1606 if (ret < 0)
1319 mlog_errno(ret); 1607 mlog_errno(ret);
1320 } 1608 }
1321 } 1609 }
1322 } 1610 }
1323 1611
1324 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1612 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
1325 OCFS2_INODE_UPDATE_CREDITS); 1613 OCFS2_JOURNAL_ACCESS_WRITE);
1326 if (IS_ERR(handle)) {
1327 ret = PTR_ERR(handle);
1328 mlog_errno(ret);
1329 goto out;
1330 }
1331
1332 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1333 OCFS2_JOURNAL_ACCESS_WRITE);
1334 if (ret) { 1614 if (ret) {
1335 mlog_errno(ret); 1615 mlog_errno(ret);
1336 goto out_commit; 1616 goto out;
1337 } 1617 }
1338 1618
1339 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 1619 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1340 /* set extended attribute in external block. */ 1620 ret = vb.vb_access(handle, inode, vb.vb_bh,
1341 ret = ocfs2_extend_trans(handle, 1621 OCFS2_JOURNAL_ACCESS_WRITE);
1342 OCFS2_INODE_UPDATE_CREDITS +
1343 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1344 if (ret) {
1345 mlog_errno(ret);
1346 goto out_commit;
1347 }
1348 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1349 OCFS2_JOURNAL_ACCESS_WRITE);
1350 if (ret) { 1622 if (ret) {
1351 mlog_errno(ret); 1623 mlog_errno(ret);
1352 goto out_commit; 1624 goto out;
1353 } 1625 }
1354 } 1626 }
1355 1627
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1363 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1635 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1364 if (ret < 0) { 1636 if (ret < 0) {
1365 mlog_errno(ret); 1637 mlog_errno(ret);
1366 goto out_commit; 1638 goto out;
1367 } 1639 }
1368 } 1640 }
1369 1641
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1391 oi->ip_dyn_features |= flag; 1663 oi->ip_dyn_features |= flag;
1392 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 1664 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1393 spin_unlock(&oi->ip_lock); 1665 spin_unlock(&oi->ip_lock);
1394 /* Update inode ctime */
1395 inode->i_ctime = CURRENT_TIME;
1396 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1397 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1398 1666
1399 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 1667 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1400 if (ret < 0) 1668 if (ret < 0)
1401 mlog_errno(ret); 1669 mlog_errno(ret);
1402 1670
1403out_commit:
1404 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1405
1406 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 1671 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1407 /* 1672 /*
1408 * Set value outside in B tree. 1673 * Set value outside in B tree.
1409 * This is the second step for value size > INLINE_SIZE. 1674 * This is the second step for value size > INLINE_SIZE.
1410 */ 1675 */
1411 size_t offs = le16_to_cpu(xs->here->xe_name_offset); 1676 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1412 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); 1677 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1678 &vb, offs);
1413 if (ret < 0) { 1679 if (ret < 0) {
1414 int ret2; 1680 int ret2;
1415 1681
@@ -1418,41 +1684,56 @@ out_commit:
1418 * If set value outside failed, we have to clean 1684 * If set value outside failed, we have to clean
1419 * the junk tree root we have already set in local. 1685 * the junk tree root we have already set in local.
1420 */ 1686 */
1421 ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); 1687 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1688 xi, xs, &vb, offs);
1422 if (ret2 < 0) 1689 if (ret2 < 0)
1423 mlog_errno(ret2); 1690 mlog_errno(ret2);
1424 } 1691 }
1425 } 1692 }
1426out: 1693out:
1427 return ret; 1694 return ret;
1428
1429} 1695}
1430 1696
1431static int ocfs2_remove_value_outside(struct inode*inode, 1697static int ocfs2_remove_value_outside(struct inode*inode,
1432 struct buffer_head *bh, 1698 struct ocfs2_xattr_value_buf *vb,
1433 struct ocfs2_xattr_header *header) 1699 struct ocfs2_xattr_header *header)
1434{ 1700{
1435 int ret = 0, i; 1701 int ret = 0, i;
1702 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1703 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1704
1705 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
1706
1707 ctxt.handle = ocfs2_start_trans(osb,
1708 ocfs2_remove_extent_credits(osb->sb));
1709 if (IS_ERR(ctxt.handle)) {
1710 ret = PTR_ERR(ctxt.handle);
1711 mlog_errno(ret);
1712 goto out;
1713 }
1436 1714
1437 for (i = 0; i < le16_to_cpu(header->xh_count); i++) { 1715 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1438 struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; 1716 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1439 1717
1440 if (!ocfs2_xattr_is_local(entry)) { 1718 if (!ocfs2_xattr_is_local(entry)) {
1441 struct ocfs2_xattr_value_root *xv;
1442 void *val; 1719 void *val;
1443 1720
1444 val = (void *)header + 1721 val = (void *)header +
1445 le16_to_cpu(entry->xe_name_offset); 1722 le16_to_cpu(entry->xe_name_offset);
1446 xv = (struct ocfs2_xattr_value_root *) 1723 vb->vb_xv = (struct ocfs2_xattr_value_root *)
1447 (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); 1724 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1448 ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); 1725 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
1449 if (ret < 0) { 1726 if (ret < 0) {
1450 mlog_errno(ret); 1727 mlog_errno(ret);
1451 return ret; 1728 break;
1452 } 1729 }
1453 } 1730 }
1454 } 1731 }
1455 1732
1733 ocfs2_commit_trans(osb, ctxt.handle);
1734 ocfs2_schedule_truncate_log_flush(osb, 1);
1735 ocfs2_run_deallocs(osb, &ctxt.dealloc);
1736out:
1456 return ret; 1737 return ret;
1457} 1738}
1458 1739
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
1463 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1744 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1464 struct ocfs2_xattr_header *header; 1745 struct ocfs2_xattr_header *header;
1465 int ret; 1746 int ret;
1747 struct ocfs2_xattr_value_buf vb = {
1748 .vb_bh = di_bh,
1749 .vb_access = ocfs2_journal_access_di,
1750 };
1466 1751
1467 header = (struct ocfs2_xattr_header *) 1752 header = (struct ocfs2_xattr_header *)
1468 ((void *)di + inode->i_sb->s_blocksize - 1753 ((void *)di + inode->i_sb->s_blocksize -
1469 le16_to_cpu(di->i_xattr_inline_size)); 1754 le16_to_cpu(di->i_xattr_inline_size));
1470 1755
1471 ret = ocfs2_remove_value_outside(inode, di_bh, header); 1756 ret = ocfs2_remove_value_outside(inode, &vb, header);
1472 1757
1473 return ret; 1758 return ret;
1474} 1759}
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
1478{ 1763{
1479 struct ocfs2_xattr_block *xb; 1764 struct ocfs2_xattr_block *xb;
1480 int ret = 0; 1765 int ret = 0;
1766 struct ocfs2_xattr_value_buf vb = {
1767 .vb_bh = blk_bh,
1768 .vb_access = ocfs2_journal_access_xb,
1769 };
1481 1770
1482 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 1771 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1483 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 1772 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1484 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); 1773 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1485 ret = ocfs2_remove_value_outside(inode, blk_bh, header); 1774 ret = ocfs2_remove_value_outside(inode, &vb, header);
1486 } else 1775 } else
1487 ret = ocfs2_delete_xattr_index_block(inode, blk_bh); 1776 ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
1488 1777
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
1502 u64 blk, bg_blkno; 1791 u64 blk, bg_blkno;
1503 u16 bit; 1792 u16 bit;
1504 1793
1505 ret = ocfs2_read_block(inode, block, &blk_bh); 1794 ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
1506 if (ret < 0) { 1795 if (ret < 0) {
1507 mlog_errno(ret); 1796 mlog_errno(ret);
1508 goto out; 1797 goto out;
1509 } 1798 }
1510 1799
1511 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1512 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
1513 ret = -EIO;
1514 goto out;
1515 }
1516
1517 ret = ocfs2_xattr_block_remove(inode, blk_bh); 1800 ret = ocfs2_xattr_block_remove(inode, blk_bh);
1518 if (ret < 0) { 1801 if (ret < 0) {
1519 mlog_errno(ret); 1802 mlog_errno(ret);
1520 goto out; 1803 goto out;
1521 } 1804 }
1522 1805
1806 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1523 blk = le64_to_cpu(xb->xb_blkno); 1807 blk = le64_to_cpu(xb->xb_blkno);
1524 bit = le16_to_cpu(xb->xb_suballoc_bit); 1808 bit = le16_to_cpu(xb->xb_suballoc_bit);
1525 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 1809 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1606 mlog_errno(ret); 1890 mlog_errno(ret);
1607 goto out; 1891 goto out;
1608 } 1892 }
1609 ret = ocfs2_journal_access(handle, inode, di_bh, 1893 ret = ocfs2_journal_access_di(handle, inode, di_bh,
1610 OCFS2_JOURNAL_ACCESS_WRITE); 1894 OCFS2_JOURNAL_ACCESS_WRITE);
1611 if (ret) { 1895 if (ret) {
1612 mlog_errno(ret); 1896 mlog_errno(ret);
1613 goto out_commit; 1897 goto out_commit;
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
1714 */ 1998 */
1715static int ocfs2_xattr_ibody_set(struct inode *inode, 1999static int ocfs2_xattr_ibody_set(struct inode *inode,
1716 struct ocfs2_xattr_info *xi, 2000 struct ocfs2_xattr_info *xi,
1717 struct ocfs2_xattr_search *xs) 2001 struct ocfs2_xattr_search *xs,
2002 struct ocfs2_xattr_set_ctxt *ctxt)
1718{ 2003{
1719 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2004 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1720 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2005 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
1731 } 2016 }
1732 } 2017 }
1733 2018
1734 ret = ocfs2_xattr_set_entry(inode, xi, xs, 2019 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
1735 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2020 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
1736out: 2021out:
1737 up_write(&oi->ip_alloc_sem); 2022 up_write(&oi->ip_alloc_sem);
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
1758 if (!di->i_xattr_loc) 2043 if (!di->i_xattr_loc)
1759 return ret; 2044 return ret;
1760 2045
1761 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); 2046 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
2047 &blk_bh);
1762 if (ret < 0) { 2048 if (ret < 0) {
1763 mlog_errno(ret); 2049 mlog_errno(ret);
1764 return ret; 2050 return ret;
1765 } 2051 }
1766 2052
1767 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1768 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
1769 ret = -EIO;
1770 goto cleanup;
1771 }
1772
1773 xs->xattr_bh = blk_bh; 2053 xs->xattr_bh = blk_bh;
2054 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1774 2055
1775 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 2056 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1776 xs->header = &xb->xb_attrs.xb_header; 2057 xs->header = &xb->xb_attrs.xb_header;
@@ -1804,13 +2085,13 @@ cleanup:
1804 */ 2085 */
1805static int ocfs2_xattr_block_set(struct inode *inode, 2086static int ocfs2_xattr_block_set(struct inode *inode,
1806 struct ocfs2_xattr_info *xi, 2087 struct ocfs2_xattr_info *xi,
1807 struct ocfs2_xattr_search *xs) 2088 struct ocfs2_xattr_search *xs,
2089 struct ocfs2_xattr_set_ctxt *ctxt)
1808{ 2090{
1809 struct buffer_head *new_bh = NULL; 2091 struct buffer_head *new_bh = NULL;
1810 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2092 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1811 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2093 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1812 struct ocfs2_alloc_context *meta_ac = NULL; 2094 handle_t *handle = ctxt->handle;
1813 handle_t *handle = NULL;
1814 struct ocfs2_xattr_block *xblk = NULL; 2095 struct ocfs2_xattr_block *xblk = NULL;
1815 u16 suballoc_bit_start; 2096 u16 suballoc_bit_start;
1816 u32 num_got; 2097 u32 num_got;
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode,
1818 int ret; 2099 int ret;
1819 2100
1820 if (!xs->xattr_bh) { 2101 if (!xs->xattr_bh) {
1821 /* 2102 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
1822 * Alloc one external block for extended attribute 2103 OCFS2_JOURNAL_ACCESS_CREATE);
1823 * outside of inode.
1824 */
1825 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
1826 if (ret < 0) { 2104 if (ret < 0) {
1827 mlog_errno(ret); 2105 mlog_errno(ret);
1828 goto out; 2106 goto end;
1829 }
1830 handle = ocfs2_start_trans(osb,
1831 OCFS2_XATTR_BLOCK_CREATE_CREDITS);
1832 if (IS_ERR(handle)) {
1833 ret = PTR_ERR(handle);
1834 mlog_errno(ret);
1835 goto out;
1836 }
1837 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1838 OCFS2_JOURNAL_ACCESS_CREATE);
1839 if (ret < 0) {
1840 mlog_errno(ret);
1841 goto out_commit;
1842 } 2107 }
1843 2108
1844 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2109 ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
1845 &suballoc_bit_start, &num_got, 2110 &suballoc_bit_start, &num_got,
1846 &first_blkno); 2111 &first_blkno);
1847 if (ret < 0) { 2112 if (ret < 0) {
1848 mlog_errno(ret); 2113 mlog_errno(ret);
1849 goto out_commit; 2114 goto end;
1850 } 2115 }
1851 2116
1852 new_bh = sb_getblk(inode->i_sb, first_blkno); 2117 new_bh = sb_getblk(inode->i_sb, first_blkno);
1853 ocfs2_set_new_buffer_uptodate(inode, new_bh); 2118 ocfs2_set_new_buffer_uptodate(inode, new_bh);
1854 2119
1855 ret = ocfs2_journal_access(handle, inode, new_bh, 2120 ret = ocfs2_journal_access_xb(handle, inode, new_bh,
1856 OCFS2_JOURNAL_ACCESS_CREATE); 2121 OCFS2_JOURNAL_ACCESS_CREATE);
1857 if (ret < 0) { 2122 if (ret < 0) {
1858 mlog_errno(ret); 2123 mlog_errno(ret);
1859 goto out_commit; 2124 goto end;
1860 } 2125 }
1861 2126
1862 /* Initialize ocfs2_xattr_block */ 2127 /* Initialize ocfs2_xattr_block */
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode,
1874 xs->end = (void *)xblk + inode->i_sb->s_blocksize; 2139 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
1875 xs->here = xs->header->xh_entries; 2140 xs->here = xs->header->xh_entries;
1876 2141
1877
1878 ret = ocfs2_journal_dirty(handle, new_bh); 2142 ret = ocfs2_journal_dirty(handle, new_bh);
1879 if (ret < 0) { 2143 if (ret < 0) {
1880 mlog_errno(ret); 2144 mlog_errno(ret);
1881 goto out_commit; 2145 goto end;
1882 } 2146 }
1883 di->i_xattr_loc = cpu_to_le64(first_blkno); 2147 di->i_xattr_loc = cpu_to_le64(first_blkno);
1884 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2148 ocfs2_journal_dirty(handle, xs->inode_bh);
1885 if (ret < 0)
1886 mlog_errno(ret);
1887out_commit:
1888 ocfs2_commit_trans(osb, handle);
1889out:
1890 if (meta_ac)
1891 ocfs2_free_alloc_context(meta_ac);
1892 if (ret < 0)
1893 return ret;
1894 } else 2149 } else
1895 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2150 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1896 2151
1897 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2152 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
1898 /* Set extended attribute into external block */ 2153 /* Set extended attribute into external block */
1899 ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); 2154 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
2155 OCFS2_HAS_XATTR_FL);
1900 if (!ret || ret != -ENOSPC) 2156 if (!ret || ret != -ENOSPC)
1901 goto end; 2157 goto end;
1902 2158
1903 ret = ocfs2_xattr_create_index_block(inode, xs); 2159 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
1904 if (ret) 2160 if (ret)
1905 goto end; 2161 goto end;
1906 } 2162 }
1907 2163
1908 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); 2164 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
1909 2165
1910end: 2166end:
1911 2167
1912 return ret; 2168 return ret;
1913} 2169}
1914 2170
2171/* Check whether the new xattr can be inserted into the inode. */
2172static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2173 struct ocfs2_xattr_info *xi,
2174 struct ocfs2_xattr_search *xs)
2175{
2176 u64 value_size;
2177 struct ocfs2_xattr_entry *last;
2178 int free, i;
2179 size_t min_offs = xs->end - xs->base;
2180
2181 if (!xs->header)
2182 return 0;
2183
2184 last = xs->header->xh_entries;
2185
2186 for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
2187 size_t offs = le16_to_cpu(last->xe_name_offset);
2188 if (offs < min_offs)
2189 min_offs = offs;
2190 last += 1;
2191 }
2192
2193 free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
2194 if (free < 0)
2195 return 0;
2196
2197 BUG_ON(!xs->not_found);
2198
2199 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
2200 value_size = OCFS2_XATTR_ROOT_SIZE;
2201 else
2202 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2203
2204 if (free >= sizeof(struct ocfs2_xattr_entry) +
2205 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2206 return 1;
2207
2208 return 0;
2209}
2210
2211static int ocfs2_calc_xattr_set_need(struct inode *inode,
2212 struct ocfs2_dinode *di,
2213 struct ocfs2_xattr_info *xi,
2214 struct ocfs2_xattr_search *xis,
2215 struct ocfs2_xattr_search *xbs,
2216 int *clusters_need,
2217 int *meta_need,
2218 int *credits_need)
2219{
2220 int ret = 0, old_in_xb = 0;
2221 int clusters_add = 0, meta_add = 0, credits = 0;
2222 struct buffer_head *bh = NULL;
2223 struct ocfs2_xattr_block *xb = NULL;
2224 struct ocfs2_xattr_entry *xe = NULL;
2225 struct ocfs2_xattr_value_root *xv = NULL;
2226 char *base = NULL;
2227 int name_offset, name_len = 0;
2228 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2229 xi->value_len);
2230 u64 value_size;
2231
2232 /*
2233 * Calculate the clusters we need to write.
2234 * No matter whether we replace an old one or add a new one,
2235 * we need this for writing.
2236 */
2237 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
2238 credits += new_clusters *
2239 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2240
2241 if (xis->not_found && xbs->not_found) {
2242 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2243
2244 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
2245 clusters_add += new_clusters;
2246 credits += ocfs2_calc_extend_credits(inode->i_sb,
2247 &def_xv.xv.xr_list,
2248 new_clusters);
2249 }
2250
2251 goto meta_guess;
2252 }
2253
2254 if (!xis->not_found) {
2255 xe = xis->here;
2256 name_offset = le16_to_cpu(xe->xe_name_offset);
2257 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
2258 base = xis->base;
2259 credits += OCFS2_INODE_UPDATE_CREDITS;
2260 } else {
2261 int i, block_off = 0;
2262 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
2263 xe = xbs->here;
2264 name_offset = le16_to_cpu(xe->xe_name_offset);
2265 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
2266 i = xbs->here - xbs->header->xh_entries;
2267 old_in_xb = 1;
2268
2269 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2270 ret = ocfs2_xattr_bucket_get_name_value(inode,
2271 bucket_xh(xbs->bucket),
2272 i, &block_off,
2273 &name_offset);
2274 base = bucket_block(xbs->bucket, block_off);
2275 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2276 } else {
2277 base = xbs->base;
2278 credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
2279 }
2280 }
2281
2282 /*
2283 * delete a xattr doesn't need metadata and cluster allocation.
2284 * so just calculate the credits and return.
2285 *
2286 * The credits for removing the value tree will be extended
2287 * by ocfs2_remove_extent itself.
2288 */
2289 if (!xi->value) {
2290 if (!ocfs2_xattr_is_local(xe))
2291 credits += ocfs2_remove_extent_credits(inode->i_sb);
2292
2293 goto out;
2294 }
2295
2296 /* do cluster allocation guess first. */
2297 value_size = le64_to_cpu(xe->xe_value_size);
2298
2299 if (old_in_xb) {
2300 /*
2301 * In xattr set, we always try to set the xe in inode first,
2302 * so if it can be inserted into inode successfully, the old
2303 * one will be removed from the xattr block, and this xattr
2304 * will be inserted into inode as a new xattr in inode.
2305 */
2306 if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
2307 clusters_add += new_clusters;
2308 credits += ocfs2_remove_extent_credits(inode->i_sb) +
2309 OCFS2_INODE_UPDATE_CREDITS;
2310 if (!ocfs2_xattr_is_local(xe))
2311 credits += ocfs2_calc_extend_credits(
2312 inode->i_sb,
2313 &def_xv.xv.xr_list,
2314 new_clusters);
2315 goto out;
2316 }
2317 }
2318
2319 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
2320 /* the new values will be stored outside. */
2321 u32 old_clusters = 0;
2322
2323 if (!ocfs2_xattr_is_local(xe)) {
2324 old_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2325 value_size);
2326 xv = (struct ocfs2_xattr_value_root *)
2327 (base + name_offset + name_len);
2328 value_size = OCFS2_XATTR_ROOT_SIZE;
2329 } else
2330 xv = &def_xv.xv;
2331
2332 if (old_clusters >= new_clusters) {
2333 credits += ocfs2_remove_extent_credits(inode->i_sb);
2334 goto out;
2335 } else {
2336 meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
2337 clusters_add += new_clusters - old_clusters;
2338 credits += ocfs2_calc_extend_credits(inode->i_sb,
2339 &xv->xr_list,
2340 new_clusters -
2341 old_clusters);
2342 if (value_size >= OCFS2_XATTR_ROOT_SIZE)
2343 goto out;
2344 }
2345 } else {
2346 /*
2347 * Now the new value will be stored inside. So if the new
2348 * value is smaller than the size of value root or the old
2349 * value, we don't need any allocation, otherwise we have
2350 * to guess metadata allocation.
2351 */
2352 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
2353 (!ocfs2_xattr_is_local(xe) &&
2354 OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
2355 goto out;
2356 }
2357
2358meta_guess:
2359 /* calculate metadata allocation. */
2360 if (di->i_xattr_loc) {
2361 if (!xbs->xattr_bh) {
2362 ret = ocfs2_read_xattr_block(inode,
2363 le64_to_cpu(di->i_xattr_loc),
2364 &bh);
2365 if (ret) {
2366 mlog_errno(ret);
2367 goto out;
2368 }
2369
2370 xb = (struct ocfs2_xattr_block *)bh->b_data;
2371 } else
2372 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
2373
2374 /*
2375 * If there is already an xattr tree, good, we can calculate
2376 * like other b-trees. Otherwise we may have the chance of
2377 * create a tree, the credit calculation is borrowed from
2378 * ocfs2_calc_extend_credits with root_el = NULL. And the
2379 * new tree will be cluster based, so no meta is needed.
2380 */
2381 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2382 struct ocfs2_extent_list *el =
2383 &xb->xb_attrs.xb_root.xt_list;
2384 meta_add += ocfs2_extend_meta_needed(el);
2385 credits += ocfs2_calc_extend_credits(inode->i_sb,
2386 el, 1);
2387 } else
2388 credits += OCFS2_SUBALLOC_ALLOC + 1;
2389
2390 /*
2391 * This cluster will be used either for new bucket or for
2392 * new xattr block.
2393 * If the cluster size is the same as the bucket size, one
2394 * more is needed since we may need to extend the bucket
2395 * also.
2396 */
2397 clusters_add += 1;
2398 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2399 if (OCFS2_XATTR_BUCKET_SIZE ==
2400 OCFS2_SB(inode->i_sb)->s_clustersize) {
2401 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2402 clusters_add += 1;
2403 }
2404 } else {
2405 meta_add += 1;
2406 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
2407 }
2408out:
2409 if (clusters_need)
2410 *clusters_need = clusters_add;
2411 if (meta_need)
2412 *meta_need = meta_add;
2413 if (credits_need)
2414 *credits_need = credits;
2415 brelse(bh);
2416 return ret;
2417}
2418
/*
 * Prepare @ctxt for an xattr set operation.
 *
 * Zeroes @ctxt, initialises its dealloc context, asks
 * ocfs2_calc_xattr_set_need() how many clusters, metadata blocks and
 * journal credits the operation described by @xi/@xis/@xbs needs, and
 * then reserves the metadata blocks into ctxt->meta_ac and the clusters
 * into ctxt->data_ac.  The credit estimate is handed back through
 * @credits.  No transaction is started here; ctxt->handle is left zeroed
 * by the memset.
 *
 * Returns 0 on success, or the error returned by the estimate/reserve
 * helpers.  On failure a metadata reservation made here is released and
 * ctxt->meta_ac is reset to NULL.
 */
2419static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2420 struct ocfs2_dinode *di,
2421 struct ocfs2_xattr_info *xi,
2422 struct ocfs2_xattr_search *xis,
2423 struct ocfs2_xattr_search *xbs,
2424 struct ocfs2_xattr_set_ctxt *ctxt,
2425 int *credits)
2426{
2427 int clusters_add, meta_add, ret;
2428 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2429
     /* Start from a clean context: handle, meta_ac and data_ac all zero. */
2430 memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
2431
2432 ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
2433
2434 ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
2435 &clusters_add, &meta_add, credits);
2436 if (ret) {
2437 mlog_errno(ret);
     /* Nothing has been reserved yet, so there is nothing to undo. */
2438 return ret;
2439 }
2440
2441 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2442 "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
2443
     /* Only reserve what the estimate says is actually needed. */
2444 if (meta_add) {
2445 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
2446 &ctxt->meta_ac);
2447 if (ret) {
2448 mlog_errno(ret);
2449 goto out;
2450 }
2451 }
2452
2453 if (clusters_add) {
2454 ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
2455 if (ret)
2456 mlog_errno(ret);
2457 }
2458out:
2459 if (ret) {
     /* Undo the metadata reservation if a later step failed. */
2460 if (ctxt->meta_ac) {
2461 ocfs2_free_alloc_context(ctxt->meta_ac);
2462 ctxt->meta_ac = NULL;
2463 }
2464
2465 /*
2466 * We cannot have an error and a non null ctxt->data_ac.
2467 */
2468 }
2469
2470 return ret;
2471}
2472
/*
 * Apply the xattr set or remove described by @xi, using the transaction
 * handle and allocators already stored in @ctxt.
 *
 * @xis and @xbs carry the caller's earlier lookups in the inode body and
 * in the external xattr block; their not_found fields decide which copy
 * is touched:
 *   - xi->value == NULL: remove the attribute from wherever it was found.
 *   - otherwise: try the inode body first.  If that succeeds and an old
 *     copy exists in the external block, remove the old copy.  If it
 *     fails with -ENOSPC, store the value in the external block instead
 *     and then remove any old copy from the inode body.
 * Before each follow-up step the handle is extended using a fresh credit
 * estimate from ocfs2_calc_xattr_set_need().
 *
 * On success the inode ctime is updated both in @inode and in @di.
 * Side effect: xi->value and xi->value_len are cleared when an old copy
 * has to be removed.
 *
 * Returns 0 on success or the first error encountered.
 */
2473static int __ocfs2_xattr_set_handle(struct inode *inode,
2474 struct ocfs2_dinode *di,
2475 struct ocfs2_xattr_info *xi,
2476 struct ocfs2_xattr_search *xis,
2477 struct ocfs2_xattr_search *xbs,
2478 struct ocfs2_xattr_set_ctxt *ctxt)
2479{
2480 int ret = 0, credits, old_found;
2481
2482 if (!xi->value) {
2483 /* Remove existing extended attribute */
2484 if (!xis->not_found)
2485 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
2486 else if (!xbs->not_found)
2487 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2488 } else {
2489 /* We always try to set the extended attribute in the inode first */
2490 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
2491 if (!ret && !xbs->not_found) {
2492 /*
2493 * If that succeeded and the extended attribute also exists
2494 * in the external block, remove it from there.
2495 */
     /* A NULL value turns the following block_set into a removal. */
2496 xi->value = NULL;
2497 xi->value_len = 0;
2498
     /*
      * Temporarily mark the inode-body search as "not found" while
      * estimating credits, then restore it.
      * NOTE(review): presumably this makes the estimate use the
      * block/bucket copy; the start of ocfs2_calc_xattr_set_need()
      * is not visible here - confirm.
      */
2499 old_found = xis->not_found;
2500 xis->not_found = -ENODATA;
2501 ret = ocfs2_calc_xattr_set_need(inode,
2502 di,
2503 xi,
2504 xis,
2505 xbs,
2506 NULL,
2507 NULL,
2508 &credits);
2509 xis->not_found = old_found;
2510 if (ret) {
2511 mlog_errno(ret);
2512 goto out;
2513 }
2514
     /* Ask for the new estimate on top of the handle's current credits. */
2515 ret = ocfs2_extend_trans(ctxt->handle, credits +
2516 ctxt->handle->h_buffer_credits);
2517 if (ret) {
2518 mlog_errno(ret);
2519 goto out;
2520 }
2521 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2522 } else if (ret == -ENOSPC) {
     /*
      * The inode has an xattr block that has not been searched yet:
      * look the name up there first, then size the handle for
      * storing the value in the block.
      */
2523 if (di->i_xattr_loc && !xbs->xattr_bh) {
2524 ret = ocfs2_xattr_block_find(inode,
2525 xi->name_index,
2526 xi->name, xbs);
2527 if (ret)
2528 goto out;
2529
2530 old_found = xis->not_found;
2531 xis->not_found = -ENODATA;
2532 ret = ocfs2_calc_xattr_set_need(inode,
2533 di,
2534 xi,
2535 xis,
2536 xbs,
2537 NULL,
2538 NULL,
2539 &credits);
2540 xis->not_found = old_found;
2541 if (ret) {
2542 mlog_errno(ret);
2543 goto out;
2544 }
2545
2546 ret = ocfs2_extend_trans(ctxt->handle, credits +
2547 ctxt->handle->h_buffer_credits);
2548 if (ret) {
2549 mlog_errno(ret);
2550 goto out;
2551 }
2552 }
2553 /*
2554 * There is no space in the inode, so set the extended
2555 * attribute in the external block.
2556 */
2557 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2558 if (ret)
2559 goto out;
2560 if (!xis->not_found) {
2561 /*
2562 * If that succeeded and the extended attribute
2563 * exists in the inode, remove it from there.
2564 */
2565 xi->value = NULL;
2566 xi->value_len = 0;
     /*
      * Unlike xis->not_found above, xbs->not_found is not
      * restored after the estimate.
      */
2567 xbs->not_found = -ENODATA;
2568 ret = ocfs2_calc_xattr_set_need(inode,
2569 di,
2570 xi,
2571 xis,
2572 xbs,
2573 NULL,
2574 NULL,
2575 &credits);
2576 if (ret) {
2577 mlog_errno(ret);
2578 goto out;
2579 }
2580
2581 ret = ocfs2_extend_trans(ctxt->handle, credits +
2582 ctxt->handle->h_buffer_credits);
2583 if (ret) {
2584 mlog_errno(ret);
2585 goto out;
2586 }
2587 ret = ocfs2_xattr_ibody_set(inode, xi,
2588 xis, ctxt);
2589 }
2590 }
     /* Any other error from the in-inode set falls through with ret set. */
2591 }
2592
2593 if (!ret) {
2594 /* Update inode ctime. */
2595 ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
2596 OCFS2_JOURNAL_ACCESS_WRITE);
2597 if (ret) {
2598 mlog_errno(ret);
2599 goto out;
2600 }
2601
     /* Keep the in-memory inode and the on-disk dinode in step. */
2602 inode->i_ctime = CURRENT_TIME;
2603 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
2604 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
2605 ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
2606 }
2607out:
2608 return ret;
2609}
2610
2611/*
2612 * This function is only called during inode creation,
2613 * to initialise the security/acl xattrs of the new inode.
2614 * All transaction credits have been reserved in mknod.
     *
     * The caller supplies the running transaction (@handle), the inode's
     * dinode buffer (@di_bh) and the allocators (@meta_ac, @data_ac); they
     * are packed into a set context and passed to
     * __ocfs2_xattr_set_handle() after the name has been looked up in the
     * inode body and, if not found there, in the external xattr block.
     * ip_xattr_sem is held for writing around the lookup and the set.
     *
     * @flags is accepted but not used by this function.
     *
     * Returns 0 on success, -EOPNOTSUPP if the volume does not support
     * xattrs, -ENOMEM if the bucket cannot be allocated, or the error from
     * the find/set helpers.
2615 */
2616int ocfs2_xattr_set_handle(handle_t *handle,
2617 struct inode *inode,
2618 struct buffer_head *di_bh,
2619 int name_index,
2620 const char *name,
2621 const void *value,
2622 size_t value_len,
2623 int flags,
2624 struct ocfs2_alloc_context *meta_ac,
2625 struct ocfs2_alloc_context *data_ac)
2626{
2627 struct ocfs2_dinode *di;
2628 int ret;
2629
2630 struct ocfs2_xattr_info xi = {
2631 .name_index = name_index,
2632 .name = name,
2633 .value = value,
2634 .value_len = value_len,
2635 };
2636
     /* Both searches start out as "not found" until the lookups below run. */
2637 struct ocfs2_xattr_search xis = {
2638 .not_found = -ENODATA,
2639 };
2640
2641 struct ocfs2_xattr_search xbs = {
2642 .not_found = -ENODATA,
2643 };
2644
     /* The transaction and allocators come from the caller, not from here. */
2645 struct ocfs2_xattr_set_ctxt ctxt = {
2646 .handle = handle,
2647 .meta_ac = meta_ac,
2648 .data_ac = data_ac,
2649 };
2650
2651 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
2652 return -EOPNOTSUPP;
2653
2654 /*
2655 * In extreme situations an xattr bucket may be needed when the
2656 * block size is too small. We have already reserved
2657 * the credits for the bucket in mknod.
2658 */
2659 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
2660 xbs.bucket = ocfs2_xattr_bucket_new(inode);
2661 if (!xbs.bucket) {
2662 mlog_errno(-ENOMEM);
     /* The semaphore is not held yet, so a plain return is enough. */
2663 return -ENOMEM;
2664 }
2665 }
2666
2667 xis.inode_bh = xbs.inode_bh = di_bh;
2668 di = (struct ocfs2_dinode *)di_bh->b_data;
2669
2670 down_write(&OCFS2_I(inode)->ip_xattr_sem);
2671
     /* Look in the inode body first; fall back to the external block. */
2672 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
2673 if (ret)
2674 goto cleanup;
2675 if (xis.not_found) {
2676 ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
2677 if (ret)
2678 goto cleanup;
2679 }
2680
2681 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
2682
2683cleanup:
2684 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2685 brelse(xbs.xattr_bh);
     /*
      * xbs.bucket is still NULL when the block size is not
      * OCFS2_MIN_BLOCKSIZE; presumably ocfs2_xattr_bucket_free()
      * tolerates NULL - its definition is not visible here, confirm.
      */
2686 ocfs2_xattr_bucket_free(xbs.bucket);
2687
2688 return ret;
2689}
2690
1915/* 2691/*
1916 * ocfs2_xattr_set() 2692 * ocfs2_xattr_set()
1917 * 2693 *
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode,
1928{ 2704{
1929 struct buffer_head *di_bh = NULL; 2705 struct buffer_head *di_bh = NULL;
1930 struct ocfs2_dinode *di; 2706 struct ocfs2_dinode *di;
1931 int ret; 2707 int ret, credits;
1932 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 2708 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2709 struct inode *tl_inode = osb->osb_tl_inode;
2710 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1933 2711
1934 struct ocfs2_xattr_info xi = { 2712 struct ocfs2_xattr_info xi = {
1935 .name_index = name_index, 2713 .name_index = name_index,
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode,
1949 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) 2727 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
1950 return -EOPNOTSUPP; 2728 return -EOPNOTSUPP;
1951 2729
2730 /*
2731 * Only xbs will be used on indexed trees. xis doesn't need a
2732 * bucket.
2733 */
2734 xbs.bucket = ocfs2_xattr_bucket_new(inode);
2735 if (!xbs.bucket) {
2736 mlog_errno(-ENOMEM);
2737 return -ENOMEM;
2738 }
2739
1952 ret = ocfs2_inode_lock(inode, &di_bh, 1); 2740 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1953 if (ret < 0) { 2741 if (ret < 0) {
1954 mlog_errno(ret); 2742 mlog_errno(ret);
1955 return ret; 2743 goto cleanup_nolock;
1956 } 2744 }
1957 xis.inode_bh = xbs.inode_bh = di_bh; 2745 xis.inode_bh = xbs.inode_bh = di_bh;
1958 di = (struct ocfs2_dinode *)di_bh->b_data; 2746 di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode,
1984 goto cleanup; 2772 goto cleanup;
1985 } 2773 }
1986 2774
1987 if (!value) { 2775
1988 /* Remove existing extended attribute */ 2776 mutex_lock(&tl_inode->i_mutex);
1989 if (!xis.not_found) 2777
1990 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); 2778 if (ocfs2_truncate_log_needs_flush(osb)) {
1991 else if (!xbs.not_found) 2779 ret = __ocfs2_flush_truncate_log(osb);
1992 ret = ocfs2_xattr_block_set(inode, &xi, &xbs); 2780 if (ret < 0) {
1993 } else { 2781 mutex_unlock(&tl_inode->i_mutex);
1994 /* We always try to set extended attribute into inode first*/ 2782 mlog_errno(ret);
1995 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); 2783 goto cleanup;
1996 if (!ret && !xbs.not_found) {
1997 /*
1998 * If succeed and that extended attribute existing in
1999 * external block, then we will remove it.
2000 */
2001 xi.value = NULL;
2002 xi.value_len = 0;
2003 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2004 } else if (ret == -ENOSPC) {
2005 if (di->i_xattr_loc && !xbs.xattr_bh) {
2006 ret = ocfs2_xattr_block_find(inode, name_index,
2007 name, &xbs);
2008 if (ret)
2009 goto cleanup;
2010 }
2011 /*
2012 * If no space in inode, we will set extended attribute
2013 * into external block.
2014 */
2015 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2016 if (ret)
2017 goto cleanup;
2018 if (!xis.not_found) {
2019 /*
2020 * If succeed and that extended attribute
2021 * existing in inode, we will remove it.
2022 */
2023 xi.value = NULL;
2024 xi.value_len = 0;
2025 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2026 }
2027 } 2784 }
2028 } 2785 }
2786 mutex_unlock(&tl_inode->i_mutex);
2787
2788 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
2789 &xbs, &ctxt, &credits);
2790 if (ret) {
2791 mlog_errno(ret);
2792 goto cleanup;
2793 }
2794
2795 /* we need to update inode's ctime field, so add credit for it. */
2796 credits += OCFS2_INODE_UPDATE_CREDITS;
2797 ctxt.handle = ocfs2_start_trans(osb, credits);
2798 if (IS_ERR(ctxt.handle)) {
2799 ret = PTR_ERR(ctxt.handle);
2800 mlog_errno(ret);
2801 goto cleanup;
2802 }
2803
2804 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
2805
2806 ocfs2_commit_trans(osb, ctxt.handle);
2807
2808 if (ctxt.data_ac)
2809 ocfs2_free_alloc_context(ctxt.data_ac);
2810 if (ctxt.meta_ac)
2811 ocfs2_free_alloc_context(ctxt.meta_ac);
2812 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
2813 ocfs2_schedule_truncate_log_flush(osb, 1);
2814 ocfs2_run_deallocs(osb, &ctxt.dealloc);
2029cleanup: 2815cleanup:
2030 up_write(&OCFS2_I(inode)->ip_xattr_sem); 2816 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2031 ocfs2_inode_unlock(inode, 1); 2817 ocfs2_inode_unlock(inode, 1);
2818cleanup_nolock:
2032 brelse(di_bh); 2819 brelse(di_bh);
2033 brelse(xbs.xattr_bh); 2820 brelse(xbs.xattr_bh);
2034 for (i = 0; i < blk_per_bucket; i++) 2821 ocfs2_xattr_bucket_free(xbs.bucket);
2035 brelse(xbs.bucket.bhs[i]);
2036 2822
2037 return ret; 2823 return ret;
2038} 2824}
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
2107 void *para); 2893 void *para);
2108 2894
2109static int ocfs2_find_xe_in_bucket(struct inode *inode, 2895static int ocfs2_find_xe_in_bucket(struct inode *inode,
2110 struct buffer_head *header_bh, 2896 struct ocfs2_xattr_bucket *bucket,
2111 int name_index, 2897 int name_index,
2112 const char *name, 2898 const char *name,
2113 u32 name_hash, 2899 u32 name_hash,
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2115 int *found) 2901 int *found)
2116{ 2902{
2117 int i, ret = 0, cmp = 1, block_off, new_offset; 2903 int i, ret = 0, cmp = 1, block_off, new_offset;
2118 struct ocfs2_xattr_header *xh = 2904 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
2119 (struct ocfs2_xattr_header *)header_bh->b_data;
2120 size_t name_len = strlen(name); 2905 size_t name_len = strlen(name);
2121 struct ocfs2_xattr_entry *xe = NULL; 2906 struct ocfs2_xattr_entry *xe = NULL;
2122 struct buffer_head *name_bh = NULL;
2123 char *xe_name; 2907 char *xe_name;
2124 2908
2125 /* 2909 /*
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2150 break; 2934 break;
2151 } 2935 }
2152 2936
2153 ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
2154 &name_bh);
2155 if (ret) {
2156 mlog_errno(ret);
2157 break;
2158 }
2159 xe_name = name_bh->b_data + new_offset;
2160
2161 cmp = memcmp(name, xe_name, name_len);
2162 brelse(name_bh);
2163 name_bh = NULL;
2164 2937
2165 if (cmp == 0) { 2938 xe_name = bucket_block(bucket, block_off) + new_offset;
2939 if (!memcmp(name, xe_name, name_len)) {
2166 *xe_index = i; 2940 *xe_index = i;
2167 *found = 1; 2941 *found = 1;
2168 ret = 0; 2942 ret = 0;
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2192 struct ocfs2_xattr_search *xs) 2966 struct ocfs2_xattr_search *xs)
2193{ 2967{
2194 int ret, found = 0; 2968 int ret, found = 0;
2195 struct buffer_head *bh = NULL;
2196 struct buffer_head *lower_bh = NULL;
2197 struct ocfs2_xattr_header *xh = NULL; 2969 struct ocfs2_xattr_header *xh = NULL;
2198 struct ocfs2_xattr_entry *xe = NULL; 2970 struct ocfs2_xattr_entry *xe = NULL;
2199 u16 index = 0; 2971 u16 index = 0;
2200 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 2972 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2201 int low_bucket = 0, bucket, high_bucket; 2973 int low_bucket = 0, bucket, high_bucket;
2974 struct ocfs2_xattr_bucket *search;
2202 u32 last_hash; 2975 u32 last_hash;
2203 u64 blkno; 2976 u64 blkno, lower_blkno = 0;
2977
2978 search = ocfs2_xattr_bucket_new(inode);
2979 if (!search) {
2980 ret = -ENOMEM;
2981 mlog_errno(ret);
2982 goto out;
2983 }
2204 2984
2205 ret = ocfs2_read_block(inode, p_blkno, &bh); 2985 ret = ocfs2_read_xattr_bucket(search, p_blkno);
2206 if (ret) { 2986 if (ret) {
2207 mlog_errno(ret); 2987 mlog_errno(ret);
2208 goto out; 2988 goto out;
2209 } 2989 }
2210 2990
2211 xh = (struct ocfs2_xattr_header *)bh->b_data; 2991 xh = bucket_xh(search);
2212 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; 2992 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
2213
2214 while (low_bucket <= high_bucket) { 2993 while (low_bucket <= high_bucket) {
2215 brelse(bh); 2994 ocfs2_xattr_bucket_relse(search);
2216 bh = NULL;
2217 bucket = (low_bucket + high_bucket) / 2;
2218 2995
2996 bucket = (low_bucket + high_bucket) / 2;
2219 blkno = p_blkno + bucket * blk_per_bucket; 2997 blkno = p_blkno + bucket * blk_per_bucket;
2220 2998 ret = ocfs2_read_xattr_bucket(search, blkno);
2221 ret = ocfs2_read_block(inode, blkno, &bh);
2222 if (ret) { 2999 if (ret) {
2223 mlog_errno(ret); 3000 mlog_errno(ret);
2224 goto out; 3001 goto out;
2225 } 3002 }
2226 3003
2227 xh = (struct ocfs2_xattr_header *)bh->b_data; 3004 xh = bucket_xh(search);
2228 xe = &xh->xh_entries[0]; 3005 xe = &xh->xh_entries[0];
2229 if (name_hash < le32_to_cpu(xe->xe_name_hash)) { 3006 if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
2230 high_bucket = bucket - 1; 3007 high_bucket = bucket - 1;
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2241 3018
2242 last_hash = le32_to_cpu(xe->xe_name_hash); 3019 last_hash = le32_to_cpu(xe->xe_name_hash);
2243 3020
2244 /* record lower_bh which may be the insert place. */ 3021 /* record lower_blkno which may be the insert place. */
2245 brelse(lower_bh); 3022 lower_blkno = blkno;
2246 lower_bh = bh;
2247 bh = NULL;
2248 3023
2249 if (name_hash > le32_to_cpu(xe->xe_name_hash)) { 3024 if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
2250 low_bucket = bucket + 1; 3025 low_bucket = bucket + 1;
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2252 } 3027 }
2253 3028
2254 /* the searched xattr should reside in this bucket if exists. */ 3029 /* the searched xattr should reside in this bucket if exists. */
2255 ret = ocfs2_find_xe_in_bucket(inode, lower_bh, 3030 ret = ocfs2_find_xe_in_bucket(inode, search,
2256 name_index, name, name_hash, 3031 name_index, name, name_hash,
2257 &index, &found); 3032 &index, &found);
2258 if (ret) { 3033 if (ret) {
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2267 * When the xattr's hash value is in the gap of 2 buckets, we will 3042 * When the xattr's hash value is in the gap of 2 buckets, we will
2268 * always set it to the previous bucket. 3043 * always set it to the previous bucket.
2269 */ 3044 */
2270 if (!lower_bh) { 3045 if (!lower_blkno)
2271 /* 3046 lower_blkno = p_blkno;
2272 * We can't find any bucket whose first name_hash is less 3047
2273 * than the find name_hash. 3048 /* This should be in cache - we just read it during the search */
2274 */ 3049 ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
2275 BUG_ON(bh->b_blocknr != p_blkno); 3050 if (ret) {
2276 lower_bh = bh; 3051 mlog_errno(ret);
2277 bh = NULL; 3052 goto out;
2278 } 3053 }
2279 xs->bucket.bhs[0] = lower_bh;
2280 xs->bucket.xh = (struct ocfs2_xattr_header *)
2281 xs->bucket.bhs[0]->b_data;
2282 lower_bh = NULL;
2283 3054
2284 xs->header = xs->bucket.xh; 3055 xs->header = bucket_xh(xs->bucket);
2285 xs->base = xs->bucket.bhs[0]->b_data; 3056 xs->base = bucket_block(xs->bucket, 0);
2286 xs->end = xs->base + inode->i_sb->s_blocksize; 3057 xs->end = xs->base + inode->i_sb->s_blocksize;
2287 3058
2288 if (found) { 3059 if (found) {
2289 /*
2290 * If we have found the xattr enty, read all the blocks in
2291 * this bucket.
2292 */
2293 ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
2294 blk_per_bucket - 1, &xs->bucket.bhs[1],
2295 0);
2296 if (ret) {
2297 mlog_errno(ret);
2298 goto out;
2299 }
2300
2301 xs->here = &xs->header->xh_entries[index]; 3060 xs->here = &xs->header->xh_entries[index];
2302 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, 3061 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
2303 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); 3062 (unsigned long long)bucket_blkno(xs->bucket), index);
2304 } else 3063 } else
2305 ret = -ENODATA; 3064 ret = -ENODATA;
2306 3065
2307out: 3066out:
2308 brelse(bh); 3067 ocfs2_xattr_bucket_free(search);
2309 brelse(lower_bh);
2310 return ret; 3068 return ret;
2311} 3069}
2312 3070
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
2357 xattr_bucket_func *func, 3115 xattr_bucket_func *func,
2358 void *para) 3116 void *para)
2359{ 3117{
2360 int i, j, ret = 0; 3118 int i, ret = 0;
2361 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2362 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); 3119 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
2363 u32 num_buckets = clusters * bpc; 3120 u32 num_buckets = clusters * bpc;
2364 struct ocfs2_xattr_bucket bucket; 3121 struct ocfs2_xattr_bucket *bucket;
2365 3122
2366 memset(&bucket, 0, sizeof(bucket)); 3123 bucket = ocfs2_xattr_bucket_new(inode);
3124 if (!bucket) {
3125 mlog_errno(-ENOMEM);
3126 return -ENOMEM;
3127 }
2367 3128
2368 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", 3129 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
2369 clusters, (unsigned long long)blkno); 3130 clusters, (unsigned long long)blkno);
2370 3131
2371 for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { 3132 for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
2372 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, 3133 ret = ocfs2_read_xattr_bucket(bucket, blkno);
2373 bucket.bhs, 0);
2374 if (ret) { 3134 if (ret) {
2375 mlog_errno(ret); 3135 mlog_errno(ret);
2376 goto out; 3136 break;
2377 } 3137 }
2378 3138
2379 bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
2380 /* 3139 /*
2381 * The real bucket num in this series of blocks is stored 3140 * The real bucket num in this series of blocks is stored
2382 * in the 1st bucket. 3141 * in the 1st bucket.
2383 */ 3142 */
2384 if (i == 0) 3143 if (i == 0)
2385 num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); 3144 num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
2386 3145
2387 mlog(0, "iterating xattr bucket %llu, first hash %u\n", 3146 mlog(0, "iterating xattr bucket %llu, first hash %u\n",
2388 (unsigned long long)blkno, 3147 (unsigned long long)blkno,
2389 le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); 3148 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
2390 if (func) { 3149 if (func) {
2391 ret = func(inode, &bucket, para); 3150 ret = func(inode, bucket, para);
2392 if (ret) { 3151 if (ret)
2393 mlog_errno(ret); 3152 mlog_errno(ret);
2394 break; 3153 /* Fall through to bucket_relse() */
2395 }
2396 } 3154 }
2397 3155
2398 for (j = 0; j < blk_per_bucket; j++) 3156 ocfs2_xattr_bucket_relse(bucket);
2399 brelse(bucket.bhs[j]); 3157 if (ret)
2400 memset(&bucket, 0, sizeof(bucket)); 3158 break;
2401 } 3159 }
2402 3160
2403out: 3161 ocfs2_xattr_bucket_free(bucket);
2404 for (j = 0; j < blk_per_bucket; j++)
2405 brelse(bucket.bhs[j]);
2406
2407 return ret; 3162 return ret;
2408} 3163}
2409 3164
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
2441 int i, block_off, new_offset; 3196 int i, block_off, new_offset;
2442 const char *prefix, *name; 3197 const char *prefix, *name;
2443 3198
2444 for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { 3199 for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
2445 struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; 3200 struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
2446 type = ocfs2_xattr_get_type(entry); 3201 type = ocfs2_xattr_get_type(entry);
2447 prefix = ocfs2_xattr_prefix(type); 3202 prefix = ocfs2_xattr_prefix(type);
2448 3203
2449 if (prefix) { 3204 if (prefix) {
2450 ret = ocfs2_xattr_bucket_get_name_value(inode, 3205 ret = ocfs2_xattr_bucket_get_name_value(inode,
2451 bucket->xh, 3206 bucket_xh(bucket),
2452 i, 3207 i,
2453 &block_off, 3208 &block_off,
2454 &new_offset); 3209 &new_offset);
2455 if (ret) 3210 if (ret)
2456 break; 3211 break;
2457 3212
2458 name = (const char *)bucket->bhs[block_off]->b_data + 3213 name = (const char *)bucket_block(bucket, block_off) +
2459 new_offset; 3214 new_offset;
2460 ret = ocfs2_xattr_list_entry(xl->buffer, 3215 ret = ocfs2_xattr_list_entry(xl->buffer,
2461 xl->buffer_size, 3216 xl->buffer_size,
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size)
2540/* 3295/*
2541 * When the ocfs2_xattr_block is filled up, new bucket will be created 3296 * When the ocfs2_xattr_block is filled up, new bucket will be created
2542 * and all the xattr entries will be moved to the new bucket. 3297 * and all the xattr entries will be moved to the new bucket.
3298 * The header goes at the start of the bucket, and the names+values are
3299 * filled from the end. This is why *target starts as the last buffer.
2543 * Note: we need to sort the entries since they are not saved in order 3300 * Note: we need to sort the entries since they are not saved in order
2544 * in the ocfs2_xattr_block. 3301 * in the ocfs2_xattr_block.
2545 */ 3302 */
2546static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, 3303static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2547 struct buffer_head *xb_bh, 3304 struct buffer_head *xb_bh,
2548 struct buffer_head *xh_bh, 3305 struct ocfs2_xattr_bucket *bucket)
2549 struct buffer_head *data_bh)
2550{ 3306{
2551 int i, blocksize = inode->i_sb->s_blocksize; 3307 int i, blocksize = inode->i_sb->s_blocksize;
3308 int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2552 u16 offset, size, off_change; 3309 u16 offset, size, off_change;
2553 struct ocfs2_xattr_entry *xe; 3310 struct ocfs2_xattr_entry *xe;
2554 struct ocfs2_xattr_block *xb = 3311 struct ocfs2_xattr_block *xb =
2555 (struct ocfs2_xattr_block *)xb_bh->b_data; 3312 (struct ocfs2_xattr_block *)xb_bh->b_data;
2556 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; 3313 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
2557 struct ocfs2_xattr_header *xh = 3314 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
2558 (struct ocfs2_xattr_header *)xh_bh->b_data;
2559 u16 count = le16_to_cpu(xb_xh->xh_count); 3315 u16 count = le16_to_cpu(xb_xh->xh_count);
2560 char *target = xh_bh->b_data, *src = xb_bh->b_data; 3316 char *src = xb_bh->b_data;
3317 char *target = bucket_block(bucket, blks - 1);
2561 3318
2562 mlog(0, "cp xattr from block %llu to bucket %llu\n", 3319 mlog(0, "cp xattr from block %llu to bucket %llu\n",
2563 (unsigned long long)xb_bh->b_blocknr, 3320 (unsigned long long)xb_bh->b_blocknr,
2564 (unsigned long long)xh_bh->b_blocknr); 3321 (unsigned long long)bucket_blkno(bucket));
3322
3323 for (i = 0; i < blks; i++)
3324 memset(bucket_block(bucket, i), 0, blocksize);
2565 3325
2566 memset(xh_bh->b_data, 0, blocksize);
2567 if (data_bh)
2568 memset(data_bh->b_data, 0, blocksize);
2569 /* 3326 /*
2570 * Since the xe_name_offset is based on ocfs2_xattr_header, 3327 * Since the xe_name_offset is based on ocfs2_xattr_header,
2571 * there is a offset change corresponding to the change of 3328 * there is a offset change corresponding to the change of
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2577 size = blocksize - offset; 3334 size = blocksize - offset;
2578 3335
2579 /* copy all the names and values. */ 3336 /* copy all the names and values. */
2580 if (data_bh)
2581 target = data_bh->b_data;
2582 memcpy(target + offset, src + offset, size); 3337 memcpy(target + offset, src + offset, size);
2583 3338
2584 /* Init new header now. */ 3339 /* Init new header now. */
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2588 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); 3343 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
2589 3344
2590 /* copy all the entries. */ 3345 /* copy all the entries. */
2591 target = xh_bh->b_data; 3346 target = bucket_block(bucket, 0);
2592 offset = offsetof(struct ocfs2_xattr_header, xh_entries); 3347 offset = offsetof(struct ocfs2_xattr_header, xh_entries);
2593 size = count * sizeof(struct ocfs2_xattr_entry); 3348 size = count * sizeof(struct ocfs2_xattr_entry);
2594 memcpy(target + offset, (char *)xb_xh + offset, size); 3349 memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2614 * While if the entry is in index b-tree, "bucket" indicates the 3369 * While if the entry is in index b-tree, "bucket" indicates the
2615 * real place of the xattr. 3370 * real place of the xattr.
2616 */ 3371 */
2617static int ocfs2_xattr_update_xattr_search(struct inode *inode, 3372static void ocfs2_xattr_update_xattr_search(struct inode *inode,
2618 struct ocfs2_xattr_search *xs, 3373 struct ocfs2_xattr_search *xs,
2619 struct buffer_head *old_bh, 3374 struct buffer_head *old_bh)
2620 struct buffer_head *new_bh)
2621{ 3375{
2622 int ret = 0;
2623 char *buf = old_bh->b_data; 3376 char *buf = old_bh->b_data;
2624 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; 3377 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
2625 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; 3378 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
2626 int i, blocksize = inode->i_sb->s_blocksize; 3379 int i;
2627 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2628
2629 xs->bucket.bhs[0] = new_bh;
2630 get_bh(new_bh);
2631 xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
2632 xs->header = xs->bucket.xh;
2633 3380
2634 xs->base = new_bh->b_data; 3381 xs->header = bucket_xh(xs->bucket);
3382 xs->base = bucket_block(xs->bucket, 0);
2635 xs->end = xs->base + inode->i_sb->s_blocksize; 3383 xs->end = xs->base + inode->i_sb->s_blocksize;
2636 3384
2637 if (!xs->not_found) { 3385 if (xs->not_found)
2638 if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { 3386 return;
2639 ret = ocfs2_read_blocks(inode,
2640 xs->bucket.bhs[0]->b_blocknr + 1,
2641 blk_per_bucket - 1, &xs->bucket.bhs[1],
2642 0);
2643 if (ret) {
2644 mlog_errno(ret);
2645 return ret;
2646 }
2647
2648 }
2649 i = xs->here - old_xh->xh_entries;
2650 xs->here = &xs->header->xh_entries[i];
2651 }
2652 3387
2653 return ret; 3388 i = xs->here - old_xh->xh_entries;
3389 xs->here = &xs->header->xh_entries[i];
2654} 3390}
2655 3391
2656static int ocfs2_xattr_create_index_block(struct inode *inode, 3392static int ocfs2_xattr_create_index_block(struct inode *inode,
2657 struct ocfs2_xattr_search *xs) 3393 struct ocfs2_xattr_search *xs,
3394 struct ocfs2_xattr_set_ctxt *ctxt)
2658{ 3395{
2659 int ret, credits = OCFS2_SUBALLOC_ALLOC; 3396 int ret;
2660 u32 bit_off, len; 3397 u32 bit_off, len;
2661 u64 blkno; 3398 u64 blkno;
2662 handle_t *handle; 3399 handle_t *handle = ctxt->handle;
2663 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3400 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2664 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3401 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2665 struct ocfs2_alloc_context *data_ac;
2666 struct buffer_head *xh_bh = NULL, *data_bh = NULL;
2667 struct buffer_head *xb_bh = xs->xattr_bh; 3402 struct buffer_head *xb_bh = xs->xattr_bh;
2668 struct ocfs2_xattr_block *xb = 3403 struct ocfs2_xattr_block *xb =
2669 (struct ocfs2_xattr_block *)xb_bh->b_data; 3404 (struct ocfs2_xattr_block *)xb_bh->b_data;
2670 struct ocfs2_xattr_tree_root *xr; 3405 struct ocfs2_xattr_tree_root *xr;
2671 u16 xb_flags = le16_to_cpu(xb->xb_flags); 3406 u16 xb_flags = le16_to_cpu(xb->xb_flags);
2672 u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2673 3407
2674 mlog(0, "create xattr index block for %llu\n", 3408 mlog(0, "create xattr index block for %llu\n",
2675 (unsigned long long)xb_bh->b_blocknr); 3409 (unsigned long long)xb_bh->b_blocknr);
2676 3410
2677 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); 3411 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
2678 3412 BUG_ON(!xs->bucket);
2679 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
2680 if (ret) {
2681 mlog_errno(ret);
2682 goto out;
2683 }
2684 3413
2685 /* 3414 /*
2686 * XXX: 3415 * XXX:
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2689 */ 3418 */
2690 down_write(&oi->ip_alloc_sem); 3419 down_write(&oi->ip_alloc_sem);
2691 3420
2692 /* 3421 ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
2693 * 3 more credits, one for xattr block update, one for the 1st block 3422 OCFS2_JOURNAL_ACCESS_WRITE);
2694 * of the new xattr bucket and one for the value/data.
2695 */
2696 credits += 3;
2697 handle = ocfs2_start_trans(osb, credits);
2698 if (IS_ERR(handle)) {
2699 ret = PTR_ERR(handle);
2700 mlog_errno(ret);
2701 goto out_sem;
2702 }
2703
2704 ret = ocfs2_journal_access(handle, inode, xb_bh,
2705 OCFS2_JOURNAL_ACCESS_WRITE);
2706 if (ret) { 3423 if (ret) {
2707 mlog_errno(ret); 3424 mlog_errno(ret);
2708 goto out_commit; 3425 goto out;
2709 } 3426 }
2710 3427
2711 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 3428 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
3429 1, 1, &bit_off, &len);
2712 if (ret) { 3430 if (ret) {
2713 mlog_errno(ret); 3431 mlog_errno(ret);
2714 goto out_commit; 3432 goto out;
2715 } 3433 }
2716 3434
2717 /* 3435 /*
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2724 mlog(0, "allocate 1 cluster from %llu to xattr block\n", 3442 mlog(0, "allocate 1 cluster from %llu to xattr block\n",
2725 (unsigned long long)blkno); 3443 (unsigned long long)blkno);
2726 3444
2727 xh_bh = sb_getblk(inode->i_sb, blkno); 3445 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
2728 if (!xh_bh) { 3446 if (ret) {
2729 ret = -EIO;
2730 mlog_errno(ret); 3447 mlog_errno(ret);
2731 goto out_commit; 3448 goto out;
2732 } 3449 }
2733 3450
2734 ocfs2_set_new_buffer_uptodate(inode, xh_bh); 3451 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
2735 3452 OCFS2_JOURNAL_ACCESS_CREATE);
2736 ret = ocfs2_journal_access(handle, inode, xh_bh,
2737 OCFS2_JOURNAL_ACCESS_CREATE);
2738 if (ret) { 3453 if (ret) {
2739 mlog_errno(ret); 3454 mlog_errno(ret);
2740 goto out_commit; 3455 goto out;
2741 }
2742
2743 if (bpb > 1) {
2744 data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
2745 if (!data_bh) {
2746 ret = -EIO;
2747 mlog_errno(ret);
2748 goto out_commit;
2749 }
2750
2751 ocfs2_set_new_buffer_uptodate(inode, data_bh);
2752
2753 ret = ocfs2_journal_access(handle, inode, data_bh,
2754 OCFS2_JOURNAL_ACCESS_CREATE);
2755 if (ret) {
2756 mlog_errno(ret);
2757 goto out_commit;
2758 }
2759 } 3456 }
2760 3457
2761 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); 3458 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
2762 3459 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
2763 ocfs2_journal_dirty(handle, xh_bh);
2764 if (data_bh)
2765 ocfs2_journal_dirty(handle, data_bh);
2766 3460
2767 ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); 3461 ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
2768 if (ret) {
2769 mlog_errno(ret);
2770 goto out_commit;
2771 }
2772 3462
2773 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ 3463 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
2774 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - 3464 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2787 3477
2788 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); 3478 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
2789 3479
2790 ret = ocfs2_journal_dirty(handle, xb_bh); 3480 ocfs2_journal_dirty(handle, xb_bh);
2791 if (ret) {
2792 mlog_errno(ret);
2793 goto out_commit;
2794 }
2795
2796out_commit:
2797 ocfs2_commit_trans(osb, handle);
2798
2799out_sem:
2800 up_write(&oi->ip_alloc_sem);
2801 3481
2802out: 3482out:
2803 if (data_ac) 3483 up_write(&oi->ip_alloc_sem);
2804 ocfs2_free_alloc_context(data_ac);
2805
2806 brelse(xh_bh);
2807 brelse(data_bh);
2808 3484
2809 return ret; 3485 return ret;
2810} 3486}
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b)
2829 * so that we can spare some space for insertion. 3505 * so that we can spare some space for insertion.
2830 */ 3506 */
2831static int ocfs2_defrag_xattr_bucket(struct inode *inode, 3507static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3508 handle_t *handle,
2832 struct ocfs2_xattr_bucket *bucket) 3509 struct ocfs2_xattr_bucket *bucket)
2833{ 3510{
2834 int ret, i; 3511 int ret, i;
2835 size_t end, offset, len, value_len; 3512 size_t end, offset, len, value_len;
2836 struct ocfs2_xattr_header *xh; 3513 struct ocfs2_xattr_header *xh;
2837 char *entries, *buf, *bucket_buf = NULL; 3514 char *entries, *buf, *bucket_buf = NULL;
2838 u64 blkno = bucket->bhs[0]->b_blocknr; 3515 u64 blkno = bucket_blkno(bucket);
2839 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2840 u16 xh_free_start; 3516 u16 xh_free_start;
2841 size_t blocksize = inode->i_sb->s_blocksize; 3517 size_t blocksize = inode->i_sb->s_blocksize;
2842 handle_t *handle;
2843 struct buffer_head **bhs;
2844 struct ocfs2_xattr_entry *xe; 3518 struct ocfs2_xattr_entry *xe;
2845 3519
2846 bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
2847 GFP_NOFS);
2848 if (!bhs)
2849 return -ENOMEM;
2850
2851 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
2852 if (ret)
2853 goto out;
2854
2855 /* 3520 /*
2856 * In order to make the operation more efficient and generic, 3521 * In order to make the operation more efficient and generic,
2857 * we copy all the blocks into a contiguous memory and do the 3522 * we copy all the blocks into a contiguous memory and do the
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2865 } 3530 }
2866 3531
2867 buf = bucket_buf; 3532 buf = bucket_buf;
2868 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) 3533 for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
2869 memcpy(buf, bhs[i]->b_data, blocksize); 3534 memcpy(buf, bucket_block(bucket, i), blocksize);
2870 3535
2871 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); 3536 ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
2872 if (IS_ERR(handle)) { 3537 OCFS2_JOURNAL_ACCESS_WRITE);
2873 ret = PTR_ERR(handle); 3538 if (ret < 0) {
2874 handle = NULL;
2875 mlog_errno(ret); 3539 mlog_errno(ret);
2876 goto out; 3540 goto out;
2877 } 3541 }
2878 3542
2879 for (i = 0; i < blk_per_bucket; i++) {
2880 ret = ocfs2_journal_access(handle, inode, bhs[i],
2881 OCFS2_JOURNAL_ACCESS_WRITE);
2882 if (ret < 0) {
2883 mlog_errno(ret);
2884 goto commit;
2885 }
2886 }
2887
2888 xh = (struct ocfs2_xattr_header *)bucket_buf; 3543 xh = (struct ocfs2_xattr_header *)bucket_buf;
2889 entries = (char *)xh->xh_entries; 3544 entries = (char *)xh->xh_entries;
2890 xh_free_start = le16_to_cpu(xh->xh_free_start); 3545 xh_free_start = le16_to_cpu(xh->xh_free_start);
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2940 "bucket %llu\n", (unsigned long long)blkno); 3595 "bucket %llu\n", (unsigned long long)blkno);
2941 3596
2942 if (xh_free_start == end) 3597 if (xh_free_start == end)
2943 goto commit; 3598 goto out;
2944 3599
2945 memset(bucket_buf + xh_free_start, 0, end - xh_free_start); 3600 memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
2946 xh->xh_free_start = cpu_to_le16(end); 3601 xh->xh_free_start = cpu_to_le16(end);
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2951 cmp_xe, swap_xe); 3606 cmp_xe, swap_xe);
2952 3607
2953 buf = bucket_buf; 3608 buf = bucket_buf;
2954 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { 3609 for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
2955 memcpy(bhs[i]->b_data, buf, blocksize); 3610 memcpy(bucket_block(bucket, i), buf, blocksize);
2956 ocfs2_journal_dirty(handle, bhs[i]); 3611 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
2957 }
2958 3612
2959commit:
2960 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2961out: 3613out:
2962
2963 if (bhs) {
2964 for (i = 0; i < blk_per_bucket; i++)
2965 brelse(bhs[i]);
2966 }
2967 kfree(bhs);
2968
2969 kfree(bucket_buf); 3614 kfree(bucket_buf);
2970 return ret; 3615 return ret;
2971} 3616}
2972 3617
2973/* 3618/*
2974 * Move half nums of the xattr bucket in the previous cluster to this new 3619 * prev_blkno points to the start of an existing extent. new_blkno
2975 * cluster. We only touch the last cluster of the previous extend record. 3620 * points to a newly allocated extent. Because we know each of our
3621 * clusters contains more than bucket, we can easily split one cluster
3622 * at a bucket boundary. So we take the last cluster of the existing
3623 * extent and split it down the middle. We move the last half of the
3624 * buckets in the last cluster of the existing extent over to the new
3625 * extent.
3626 *
3627 * first_bh is the buffer at prev_blkno so we can update the existing
3628 * extent's bucket count. header_bh is the bucket were we were hoping
3629 * to insert our xattr. If the bucket move places the target in the new
3630 * extent, we'll update first_bh and header_bh after modifying the old
3631 * extent.
2976 * 3632 *
2977 * first_bh is the first buffer_head of a series of bucket in the same 3633 * first_hash will be set as the 1st xe's name_hash in the new extent.
2978 * extent rec and header_bh is the header of one bucket in this cluster.
2979 * They will be updated if we move the data header_bh contains to the new
2980 * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
2981 */ 3634 */
2982static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, 3635static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
2983 handle_t *handle, 3636 handle_t *handle,
2984 struct buffer_head **first_bh, 3637 struct ocfs2_xattr_bucket *first,
2985 struct buffer_head **header_bh, 3638 struct ocfs2_xattr_bucket *target,
2986 u64 new_blkno, 3639 u64 new_blkno,
2987 u64 prev_blkno,
2988 u32 num_clusters, 3640 u32 num_clusters,
2989 u32 *first_hash) 3641 u32 *first_hash)
2990{ 3642{
2991 int i, ret, credits; 3643 int ret;
2992 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3644 struct super_block *sb = inode->i_sb;
2993 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 3645 int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
2994 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 3646 int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
2995 int blocksize = inode->i_sb->s_blocksize; 3647 int to_move = num_buckets / 2;
2996 struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; 3648 u64 src_blkno;
2997 struct ocfs2_xattr_header *new_xh; 3649 u64 last_cluster_blkno = bucket_blkno(first) +
2998 struct ocfs2_xattr_header *xh = 3650 ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
2999 (struct ocfs2_xattr_header *)((*first_bh)->b_data);
3000
3001 BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
3002 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
3003
3004 prev_bh = *first_bh;
3005 get_bh(prev_bh);
3006 xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
3007 3651
3008 prev_blkno += (num_clusters - 1) * bpc + bpc / 2; 3652 BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
3653 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
3009 3654
3010 mlog(0, "move half of xattrs in cluster %llu to %llu\n", 3655 mlog(0, "move half of xattrs in cluster %llu to %llu\n",
3011 (unsigned long long)prev_blkno, (unsigned long long)new_blkno); 3656 (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
3012 3657
3013 /* 3658 ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
3014 * We need to update the 1st half of the new cluster and 3659 last_cluster_blkno, new_blkno,
3015 * 1 more for the update of the 1st bucket of the previous 3660 to_move, first_hash);
3016 * extent record.
3017 */
3018 credits = bpc / 2 + 1;
3019 ret = ocfs2_extend_trans(handle, credits);
3020 if (ret) { 3661 if (ret) {
3021 mlog_errno(ret); 3662 mlog_errno(ret);
3022 goto out; 3663 goto out;
3023 } 3664 }
3024 3665
3025 ret = ocfs2_journal_access(handle, inode, prev_bh, 3666 /* This is the first bucket that got moved */
3026 OCFS2_JOURNAL_ACCESS_WRITE); 3667 src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
3027 if (ret) {
3028 mlog_errno(ret);
3029 goto out;
3030 }
3031 3668
3032 for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { 3669 /*
3033 old_bh = new_bh = NULL; 3670 * If the target bucket was part of the moved buckets, we need to
3034 new_bh = sb_getblk(inode->i_sb, new_blkno); 3671 * update first and target.
3035 if (!new_bh) { 3672 */
3036 ret = -EIO; 3673 if (bucket_blkno(target) >= src_blkno) {
3037 mlog_errno(ret); 3674 /* Find the block for the new target bucket */
3038 goto out; 3675 src_blkno = new_blkno +
3039 } 3676 (bucket_blkno(target) - src_blkno);
3040 3677
3041 ocfs2_set_new_buffer_uptodate(inode, new_bh); 3678 ocfs2_xattr_bucket_relse(first);
3679 ocfs2_xattr_bucket_relse(target);
3042 3680
3043 ret = ocfs2_journal_access(handle, inode, new_bh, 3681 /*
3044 OCFS2_JOURNAL_ACCESS_CREATE); 3682 * These shouldn't fail - the buffers are in the
3045 if (ret < 0) { 3683 * journal from ocfs2_cp_xattr_bucket().
3684 */
3685 ret = ocfs2_read_xattr_bucket(first, new_blkno);
3686 if (ret) {
3046 mlog_errno(ret); 3687 mlog_errno(ret);
3047 brelse(new_bh);
3048 goto out; 3688 goto out;
3049 } 3689 }
3050 3690 ret = ocfs2_read_xattr_bucket(target, src_blkno);
3051 ret = ocfs2_read_block(inode, prev_blkno, &old_bh); 3691 if (ret)
3052 if (ret < 0) {
3053 mlog_errno(ret); 3692 mlog_errno(ret);
3054 brelse(new_bh);
3055 goto out;
3056 }
3057
3058 memcpy(new_bh->b_data, old_bh->b_data, blocksize);
3059 3693
3060 if (i == 0) {
3061 new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
3062 new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
3063
3064 if (first_hash)
3065 *first_hash = le32_to_cpu(
3066 new_xh->xh_entries[0].xe_name_hash);
3067 new_first_bh = new_bh;
3068 get_bh(new_first_bh);
3069 }
3070
3071 ocfs2_journal_dirty(handle, new_bh);
3072
3073 if (*header_bh == old_bh) {
3074 brelse(*header_bh);
3075 *header_bh = new_bh;
3076 get_bh(*header_bh);
3077
3078 brelse(*first_bh);
3079 *first_bh = new_first_bh;
3080 get_bh(*first_bh);
3081 }
3082 brelse(new_bh);
3083 brelse(old_bh);
3084 } 3694 }
3085 3695
3086 le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
3087
3088 ocfs2_journal_dirty(handle, prev_bh);
3089out: 3696out:
3090 brelse(prev_bh);
3091 brelse(new_first_bh);
3092 return ret;
3093}
3094
3095static int ocfs2_read_xattr_bucket(struct inode *inode,
3096 u64 blkno,
3097 struct buffer_head **bhs,
3098 int new)
3099{
3100 int ret = 0;
3101 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3102
3103 if (!new)
3104 return ocfs2_read_blocks(inode, blkno,
3105 blk_per_bucket, bhs, 0);
3106
3107 for (i = 0; i < blk_per_bucket; i++) {
3108 bhs[i] = sb_getblk(inode->i_sb, blkno + i);
3109 if (bhs[i] == NULL) {
3110 ret = -EIO;
3111 mlog_errno(ret);
3112 break;
3113 }
3114 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
3115 }
3116
3117 return ret; 3697 return ret;
3118} 3698}
3119 3699
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3178{ 3758{
3179 int ret, i; 3759 int ret, i;
3180 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 3760 int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
3181 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3761 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
3182 struct buffer_head **s_bhs, **t_bhs = NULL;
3183 struct ocfs2_xattr_header *xh; 3762 struct ocfs2_xattr_header *xh;
3184 struct ocfs2_xattr_entry *xe; 3763 struct ocfs2_xattr_entry *xe;
3185 int blocksize = inode->i_sb->s_blocksize; 3764 int blocksize = inode->i_sb->s_blocksize;
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3187 mlog(0, "move some of xattrs from bucket %llu to %llu\n", 3766 mlog(0, "move some of xattrs from bucket %llu to %llu\n",
3188 (unsigned long long)blk, (unsigned long long)new_blk); 3767 (unsigned long long)blk, (unsigned long long)new_blk);
3189 3768
3190 s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); 3769 s_bucket = ocfs2_xattr_bucket_new(inode);
3191 if (!s_bhs) 3770 t_bucket = ocfs2_xattr_bucket_new(inode);
3192 return -ENOMEM; 3771 if (!s_bucket || !t_bucket) {
3193 3772 ret = -ENOMEM;
3194 ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
3195 if (ret) {
3196 mlog_errno(ret); 3773 mlog_errno(ret);
3197 goto out; 3774 goto out;
3198 } 3775 }
3199 3776
3200 ret = ocfs2_journal_access(handle, inode, s_bhs[0], 3777 ret = ocfs2_read_xattr_bucket(s_bucket, blk);
3201 OCFS2_JOURNAL_ACCESS_WRITE);
3202 if (ret) { 3778 if (ret) {
3203 mlog_errno(ret); 3779 mlog_errno(ret);
3204 goto out; 3780 goto out;
3205 } 3781 }
3206 3782
3207 t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); 3783 ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
3208 if (!t_bhs) { 3784 OCFS2_JOURNAL_ACCESS_WRITE);
3209 ret = -ENOMEM; 3785 if (ret) {
3786 mlog_errno(ret);
3210 goto out; 3787 goto out;
3211 } 3788 }
3212 3789
3213 ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); 3790 /*
3791 * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
3792 * there's no need to read it.
3793 */
3794 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
3214 if (ret) { 3795 if (ret) {
3215 mlog_errno(ret); 3796 mlog_errno(ret);
3216 goto out; 3797 goto out;
3217 } 3798 }
3218 3799
3219 for (i = 0; i < blk_per_bucket; i++) { 3800 /*
3220 ret = ocfs2_journal_access(handle, inode, t_bhs[i], 3801 * Hey, if we're overwriting t_bucket, what difference does
3221 new_bucket_head ? 3802 * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the
3222 OCFS2_JOURNAL_ACCESS_CREATE : 3803 * same part of ocfs2_cp_xattr_bucket().
3223 OCFS2_JOURNAL_ACCESS_WRITE); 3804 */
3224 if (ret) { 3805 ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
3225 mlog_errno(ret); 3806 new_bucket_head ?
3226 goto out; 3807 OCFS2_JOURNAL_ACCESS_CREATE :
3227 } 3808 OCFS2_JOURNAL_ACCESS_WRITE);
3809 if (ret) {
3810 mlog_errno(ret);
3811 goto out;
3228 } 3812 }
3229 3813
3230 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; 3814 xh = bucket_xh(s_bucket);
3231 count = le16_to_cpu(xh->xh_count); 3815 count = le16_to_cpu(xh->xh_count);
3232 start = ocfs2_xattr_find_divide_pos(xh); 3816 start = ocfs2_xattr_find_divide_pos(xh);
3233 3817
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3239 * The hash value is set as one larger than 3823 * The hash value is set as one larger than
3240 * that of the last entry in the previous bucket. 3824 * that of the last entry in the previous bucket.
3241 */ 3825 */
3242 for (i = 0; i < blk_per_bucket; i++) 3826 for (i = 0; i < t_bucket->bu_blocks; i++)
3243 memset(t_bhs[i]->b_data, 0, blocksize); 3827 memset(bucket_block(t_bucket, i), 0, blocksize);
3244 3828
3245 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; 3829 xh = bucket_xh(t_bucket);
3246 xh->xh_free_start = cpu_to_le16(blocksize); 3830 xh->xh_free_start = cpu_to_le16(blocksize);
3247 xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; 3831 xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
3248 le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); 3832 le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3251 } 3835 }
3252 3836
3253 /* copy the whole bucket to the new first. */ 3837 /* copy the whole bucket to the new first. */
3254 for (i = 0; i < blk_per_bucket; i++) 3838 ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
3255 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3256 3839
3257 /* update the new bucket. */ 3840 /* update the new bucket. */
3258 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; 3841 xh = bucket_xh(t_bucket);
3259 3842
3260 /* 3843 /*
3261 * Calculate the total name/value len and xh_free_start for 3844 * Calculate the total name/value len and xh_free_start for
@@ -3319,11 +3902,7 @@ set_num_buckets:
3319 else 3902 else
3320 xh->xh_num_buckets = 0; 3903 xh->xh_num_buckets = 0;
3321 3904
3322 for (i = 0; i < blk_per_bucket; i++) { 3905 ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
3323 ocfs2_journal_dirty(handle, t_bhs[i]);
3324 if (ret)
3325 mlog_errno(ret);
3326 }
3327 3906
3328 /* store the first_hash of the new bucket. */ 3907 /* store the first_hash of the new bucket. */
3329 if (first_hash) 3908 if (first_hash)
@@ -3337,29 +3916,18 @@ set_num_buckets:
3337 if (start == count) 3916 if (start == count)
3338 goto out; 3917 goto out;
3339 3918
3340 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; 3919 xh = bucket_xh(s_bucket);
3341 memset(&xh->xh_entries[start], 0, 3920 memset(&xh->xh_entries[start], 0,
3342 sizeof(struct ocfs2_xattr_entry) * (count - start)); 3921 sizeof(struct ocfs2_xattr_entry) * (count - start));
3343 xh->xh_count = cpu_to_le16(start); 3922 xh->xh_count = cpu_to_le16(start);
3344 xh->xh_free_start = cpu_to_le16(name_offset); 3923 xh->xh_free_start = cpu_to_le16(name_offset);
3345 xh->xh_name_value_len = cpu_to_le16(name_value_len); 3924 xh->xh_name_value_len = cpu_to_le16(name_value_len);
3346 3925
3347 ocfs2_journal_dirty(handle, s_bhs[0]); 3926 ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
3348 if (ret)
3349 mlog_errno(ret);
3350 3927
3351out: 3928out:
3352 if (s_bhs) { 3929 ocfs2_xattr_bucket_free(s_bucket);
3353 for (i = 0; i < blk_per_bucket; i++) 3930 ocfs2_xattr_bucket_free(t_bucket);
3354 brelse(s_bhs[i]);
3355 }
3356 kfree(s_bhs);
3357
3358 if (t_bhs) {
3359 for (i = 0; i < blk_per_bucket; i++)
3360 brelse(t_bhs[i]);
3361 }
3362 kfree(t_bhs);
3363 3931
3364 return ret; 3932 return ret;
3365} 3933}
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
3376 u64 t_blkno, 3944 u64 t_blkno,
3377 int t_is_new) 3945 int t_is_new)
3378{ 3946{
3379 int ret, i; 3947 int ret;
3380 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3948 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
3381 int blocksize = inode->i_sb->s_blocksize;
3382 struct buffer_head **s_bhs, **t_bhs = NULL;
3383 3949
3384 BUG_ON(s_blkno == t_blkno); 3950 BUG_ON(s_blkno == t_blkno);
3385 3951
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
3387 (unsigned long long)s_blkno, (unsigned long long)t_blkno, 3953 (unsigned long long)s_blkno, (unsigned long long)t_blkno,
3388 t_is_new); 3954 t_is_new);
3389 3955
3390 s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, 3956 s_bucket = ocfs2_xattr_bucket_new(inode);
3391 GFP_NOFS); 3957 t_bucket = ocfs2_xattr_bucket_new(inode);
3392 if (!s_bhs) 3958 if (!s_bucket || !t_bucket) {
3393 return -ENOMEM; 3959 ret = -ENOMEM;
3960 mlog_errno(ret);
3961 goto out;
3962 }
3394 3963
3395 ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); 3964 ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
3396 if (ret) 3965 if (ret)
3397 goto out; 3966 goto out;
3398 3967
3399 t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, 3968 /*
3400 GFP_NOFS); 3969 * Even if !t_is_new, we're overwriting t_bucket. Thus,
3401 if (!t_bhs) { 3970 * there's no need to read it.
3402 ret = -ENOMEM; 3971 */
3972 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
3973 if (ret)
3403 goto out; 3974 goto out;
3404 }
3405 3975
3406 ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); 3976 /*
3977 * Hey, if we're overwriting t_bucket, what difference does
3978 * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new
3979 * cluster to fill, we came here from
3980 * ocfs2_mv_xattr_buckets(), and it is really new -
3981 * ACCESS_CREATE is required. But we also might have moved data
3982 * out of t_bucket before extending back into it.
3983 * ocfs2_add_new_xattr_bucket() can do this - its call to
3984 * ocfs2_add_new_xattr_cluster() may have created a new extent
3985 * and copied out the end of the old extent. Then it re-extends
3986 * the old extent back to create space for new xattrs. That's
3987 * how we get here, and the bucket isn't really new.
3988 */
3989 ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
3990 t_is_new ?
3991 OCFS2_JOURNAL_ACCESS_CREATE :
3992 OCFS2_JOURNAL_ACCESS_WRITE);
3407 if (ret) 3993 if (ret)
3408 goto out; 3994 goto out;
3409 3995
3410 for (i = 0; i < blk_per_bucket; i++) { 3996 ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
3411 ret = ocfs2_journal_access(handle, inode, t_bhs[i], 3997 ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
3412 t_is_new ?
3413 OCFS2_JOURNAL_ACCESS_CREATE :
3414 OCFS2_JOURNAL_ACCESS_WRITE);
3415 if (ret)
3416 goto out;
3417 }
3418
3419 for (i = 0; i < blk_per_bucket; i++) {
3420 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3421 ocfs2_journal_dirty(handle, t_bhs[i]);
3422 }
3423 3998
3424out: 3999out:
3425 if (s_bhs) { 4000 ocfs2_xattr_bucket_free(t_bucket);
3426 for (i = 0; i < blk_per_bucket; i++) 4001 ocfs2_xattr_bucket_free(s_bucket);
3427 brelse(s_bhs[i]);
3428 }
3429 kfree(s_bhs);
3430
3431 if (t_bhs) {
3432 for (i = 0; i < blk_per_bucket; i++)
3433 brelse(t_bhs[i]);
3434 }
3435 kfree(t_bhs);
3436 4002
3437 return ret; 4003 return ret;
3438} 4004}
3439 4005
3440/* 4006/*
3441 * Copy one xattr cluster from src_blk to to_blk. 4007 * src_blk points to the start of an existing extent. last_blk points to
3442 * The to_blk will become the first bucket header of the cluster, so its 4008 * last cluster in that extent. to_blk points to a newly allocated
3443 * xh_num_buckets will be initialized as the bucket num in the cluster. 4009 * extent. We copy the buckets from the cluster at last_blk to the new
4010 * extent. If start_bucket is non-zero, we skip that many buckets before
4011 * we start copying. The new extent's xh_num_buckets gets set to the
4012 * number of buckets we copied. The old extent's xh_num_buckets shrinks
4013 * by the same amount.
3444 */ 4014 */
3445static int ocfs2_cp_xattr_cluster(struct inode *inode, 4015static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
3446 handle_t *handle, 4016 u64 src_blk, u64 last_blk, u64 to_blk,
3447 struct buffer_head *first_bh, 4017 unsigned int start_bucket,
3448 u64 src_blk,
3449 u64 to_blk,
3450 u32 *first_hash) 4018 u32 *first_hash)
3451{ 4019{
3452 int i, ret, credits; 4020 int i, ret, credits;
3453 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4021 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3454 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 4022 int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3455 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 4023 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3456 struct buffer_head *bh = NULL; 4024 struct ocfs2_xattr_bucket *old_first, *new_first;
3457 struct ocfs2_xattr_header *xh; 4025
3458 u64 to_blk_start = to_blk; 4026 mlog(0, "mv xattrs from cluster %llu to %llu\n",
4027 (unsigned long long)last_blk, (unsigned long long)to_blk);
4028
4029 BUG_ON(start_bucket >= num_buckets);
4030 if (start_bucket) {
4031 num_buckets -= start_bucket;
4032 last_blk += (start_bucket * blks_per_bucket);
4033 }
4034
4035 /* The first bucket of the original extent */
4036 old_first = ocfs2_xattr_bucket_new(inode);
4037 /* The first bucket of the new extent */
4038 new_first = ocfs2_xattr_bucket_new(inode);
4039 if (!old_first || !new_first) {
4040 ret = -ENOMEM;
4041 mlog_errno(ret);
4042 goto out;
4043 }
3459 4044
3460 mlog(0, "cp xattrs from cluster %llu to %llu\n", 4045 ret = ocfs2_read_xattr_bucket(old_first, src_blk);
3461 (unsigned long long)src_blk, (unsigned long long)to_blk); 4046 if (ret) {
4047 mlog_errno(ret);
4048 goto out;
4049 }
3462 4050
3463 /* 4051 /*
3464 * We need to update the new cluster and 1 more for the update of 4052 * We need to update the first bucket of the old extent and all
3465 * the 1st bucket of the previous extent rec. 4053 * the buckets going to the new extent.
3466 */ 4054 */
3467 credits = bpc + 1; 4055 credits = ((num_buckets + 1) * blks_per_bucket) +
4056 handle->h_buffer_credits;
3468 ret = ocfs2_extend_trans(handle, credits); 4057 ret = ocfs2_extend_trans(handle, credits);
3469 if (ret) { 4058 if (ret) {
3470 mlog_errno(ret); 4059 mlog_errno(ret);
3471 goto out; 4060 goto out;
3472 } 4061 }
3473 4062
3474 ret = ocfs2_journal_access(handle, inode, first_bh, 4063 ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
3475 OCFS2_JOURNAL_ACCESS_WRITE); 4064 OCFS2_JOURNAL_ACCESS_WRITE);
3476 if (ret) { 4065 if (ret) {
3477 mlog_errno(ret); 4066 mlog_errno(ret);
3478 goto out; 4067 goto out;
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
3480 4069
3481 for (i = 0; i < num_buckets; i++) { 4070 for (i = 0; i < num_buckets; i++) {
3482 ret = ocfs2_cp_xattr_bucket(inode, handle, 4071 ret = ocfs2_cp_xattr_bucket(inode, handle,
3483 src_blk, to_blk, 1); 4072 last_blk + (i * blks_per_bucket),
4073 to_blk + (i * blks_per_bucket),
4074 1);
3484 if (ret) { 4075 if (ret) {
3485 mlog_errno(ret); 4076 mlog_errno(ret);
3486 goto out; 4077 goto out;
3487 } 4078 }
3488
3489 src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3490 to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3491 } 4079 }
3492 4080
3493 /* update the old bucket header. */ 4081 /*
3494 xh = (struct ocfs2_xattr_header *)first_bh->b_data; 4082 * Get the new bucket ready before we dirty anything
3495 le16_add_cpu(&xh->xh_num_buckets, -num_buckets); 4083 * (This actually shouldn't fail, because we already dirtied
3496 4084 * it once in ocfs2_cp_xattr_bucket()).
3497 ocfs2_journal_dirty(handle, first_bh); 4085 */
3498 4086 ret = ocfs2_read_xattr_bucket(new_first, to_blk);
3499 /* update the new bucket header. */ 4087 if (ret) {
3500 ret = ocfs2_read_block(inode, to_blk_start, &bh);
3501 if (ret < 0) {
3502 mlog_errno(ret); 4088 mlog_errno(ret);
3503 goto out; 4089 goto out;
3504 } 4090 }
3505 4091 ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
3506 ret = ocfs2_journal_access(handle, inode, bh, 4092 OCFS2_JOURNAL_ACCESS_WRITE);
3507 OCFS2_JOURNAL_ACCESS_WRITE);
3508 if (ret) { 4093 if (ret) {
3509 mlog_errno(ret); 4094 mlog_errno(ret);
3510 goto out; 4095 goto out;
3511 } 4096 }
3512 4097
3513 xh = (struct ocfs2_xattr_header *)bh->b_data; 4098 /* Now update the headers */
3514 xh->xh_num_buckets = cpu_to_le16(num_buckets); 4099 le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
4100 ocfs2_xattr_bucket_journal_dirty(handle, old_first);
3515 4101
3516 ocfs2_journal_dirty(handle, bh); 4102 bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
4103 ocfs2_xattr_bucket_journal_dirty(handle, new_first);
3517 4104
3518 if (first_hash) 4105 if (first_hash)
3519 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); 4106 *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
4107
3520out: 4108out:
3521 brelse(bh); 4109 ocfs2_xattr_bucket_free(new_first);
4110 ocfs2_xattr_bucket_free(old_first);
3522 return ret; 4111 return ret;
3523} 4112}
3524 4113
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
3534 u32 *first_hash) 4123 u32 *first_hash)
3535{ 4124{
3536 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4125 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3537 int ret, credits = 2 * blk_per_bucket; 4126 int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
3538 4127
3539 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); 4128 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
3540 4129
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
3577 */ 4166 */
3578static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, 4167static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3579 handle_t *handle, 4168 handle_t *handle,
3580 struct buffer_head **first_bh, 4169 struct ocfs2_xattr_bucket *first,
3581 struct buffer_head **header_bh, 4170 struct ocfs2_xattr_bucket *target,
3582 u64 new_blk, 4171 u64 new_blk,
3583 u64 prev_blk,
3584 u32 prev_clusters, 4172 u32 prev_clusters,
3585 u32 *v_start, 4173 u32 *v_start,
3586 int *extend) 4174 int *extend)
3587{ 4175{
3588 int ret = 0; 4176 int ret;
3589 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3590 4177
3591 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", 4178 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
3592 (unsigned long long)prev_blk, prev_clusters, 4179 (unsigned long long)bucket_blkno(first), prev_clusters,
3593 (unsigned long long)new_blk); 4180 (unsigned long long)new_blk);
3594 4181
3595 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) 4182 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
3596 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, 4183 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
3597 handle, 4184 handle,
3598 first_bh, 4185 first, target,
3599 header_bh,
3600 new_blk, 4186 new_blk,
3601 prev_blk,
3602 prev_clusters, 4187 prev_clusters,
3603 v_start); 4188 v_start);
3604 else { 4189 if (ret)
3605 u64 last_blk = prev_blk + bpc * (prev_clusters - 1); 4190 mlog_errno(ret);
3606 4191 } else {
3607 if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) 4192 /* The start of the last cluster in the first extent */
3608 ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, 4193 u64 last_blk = bucket_blkno(first) +
3609 last_blk, new_blk, 4194 ((prev_clusters - 1) *
4195 ocfs2_clusters_to_blocks(inode->i_sb, 1));
4196
4197 if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
4198 ret = ocfs2_mv_xattr_buckets(inode, handle,
4199 bucket_blkno(first),
4200 last_blk, new_blk, 0,
3610 v_start); 4201 v_start);
3611 else { 4202 if (ret)
4203 mlog_errno(ret);
4204 } else {
3612 ret = ocfs2_divide_xattr_cluster(inode, handle, 4205 ret = ocfs2_divide_xattr_cluster(inode, handle,
3613 last_blk, new_blk, 4206 last_blk, new_blk,
3614 v_start); 4207 v_start);
4208 if (ret)
4209 mlog_errno(ret);
3615 4210
3616 if ((*header_bh)->b_blocknr == last_blk && extend) 4211 if ((bucket_blkno(target) == last_blk) && extend)
3617 *extend = 0; 4212 *extend = 0;
3618 } 4213 }
3619 } 4214 }
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3639 */ 4234 */
3640static int ocfs2_add_new_xattr_cluster(struct inode *inode, 4235static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3641 struct buffer_head *root_bh, 4236 struct buffer_head *root_bh,
3642 struct buffer_head **first_bh, 4237 struct ocfs2_xattr_bucket *first,
3643 struct buffer_head **header_bh, 4238 struct ocfs2_xattr_bucket *target,
3644 u32 *num_clusters, 4239 u32 *num_clusters,
3645 u32 prev_cpos, 4240 u32 prev_cpos,
3646 u64 prev_blkno, 4241 int *extend,
3647 int *extend) 4242 struct ocfs2_xattr_set_ctxt *ctxt)
3648{ 4243{
3649 int ret, credits; 4244 int ret;
3650 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 4245 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3651 u32 prev_clusters = *num_clusters; 4246 u32 prev_clusters = *num_clusters;
3652 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; 4247 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
3653 u64 block; 4248 u64 block;
3654 handle_t *handle = NULL; 4249 handle_t *handle = ctxt->handle;
3655 struct ocfs2_alloc_context *data_ac = NULL;
3656 struct ocfs2_alloc_context *meta_ac = NULL;
3657 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3658 struct ocfs2_extent_tree et; 4251 struct ocfs2_extent_tree et;
3659 4252
3660 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " 4253 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
3661 "previous xattr blkno = %llu\n", 4254 "previous xattr blkno = %llu\n",
3662 (unsigned long long)OCFS2_I(inode)->ip_blkno, 4255 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3663 prev_cpos, (unsigned long long)prev_blkno); 4256 prev_cpos, (unsigned long long)bucket_blkno(first));
3664 4257
3665 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 4258 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
3666 4259
3667 ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 4260 ret = ocfs2_journal_access_xb(handle, inode, root_bh,
3668 &data_ac, &meta_ac); 4261 OCFS2_JOURNAL_ACCESS_WRITE);
3669 if (ret) {
3670 mlog_errno(ret);
3671 goto leave;
3672 }
3673
3674 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
3675 clusters_to_add);
3676 handle = ocfs2_start_trans(osb, credits);
3677 if (IS_ERR(handle)) {
3678 ret = PTR_ERR(handle);
3679 handle = NULL;
3680 mlog_errno(ret);
3681 goto leave;
3682 }
3683
3684 ret = ocfs2_journal_access(handle, inode, root_bh,
3685 OCFS2_JOURNAL_ACCESS_WRITE);
3686 if (ret < 0) { 4262 if (ret < 0) {
3687 mlog_errno(ret); 4263 mlog_errno(ret);
3688 goto leave; 4264 goto leave;
3689 } 4265 }
3690 4266
3691 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4267 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
3692 clusters_to_add, &bit_off, &num_bits); 4268 clusters_to_add, &bit_off, &num_bits);
3693 if (ret < 0) { 4269 if (ret < 0) {
3694 if (ret != -ENOSPC) 4270 if (ret != -ENOSPC)
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3702 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", 4278 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
3703 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4279 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3704 4280
3705 if (prev_blkno + prev_clusters * bpc == block && 4281 if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
3706 (prev_clusters + num_bits) << osb->s_clustersize_bits <= 4282 (prev_clusters + num_bits) << osb->s_clustersize_bits <=
3707 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { 4283 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
3708 /* 4284 /*
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3721 } else { 4297 } else {
3722 ret = ocfs2_adjust_xattr_cross_cluster(inode, 4298 ret = ocfs2_adjust_xattr_cross_cluster(inode,
3723 handle, 4299 handle,
3724 first_bh, 4300 first,
3725 header_bh, 4301 target,
3726 block, 4302 block,
3727 prev_blkno,
3728 prev_clusters, 4303 prev_clusters,
3729 &v_start, 4304 &v_start,
3730 extend); 4305 extend);
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3734 } 4309 }
3735 } 4310 }
3736 4311
3737 if (handle->h_buffer_credits < credits) {
3738 /*
3739 * The journal has been restarted before, and don't
3740 * have enough space for the insertion, so extend it
3741 * here.
3742 */
3743 ret = ocfs2_extend_trans(handle, credits);
3744 if (ret) {
3745 mlog_errno(ret);
3746 goto leave;
3747 }
3748 }
3749 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", 4312 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
3750 num_bits, (unsigned long long)block, v_start); 4313 num_bits, (unsigned long long)block, v_start);
3751 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, 4314 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
3752 num_bits, 0, meta_ac); 4315 num_bits, 0, ctxt->meta_ac);
3753 if (ret < 0) { 4316 if (ret < 0) {
3754 mlog_errno(ret); 4317 mlog_errno(ret);
3755 goto leave; 4318 goto leave;
3756 } 4319 }
3757 4320
3758 ret = ocfs2_journal_dirty(handle, root_bh); 4321 ret = ocfs2_journal_dirty(handle, root_bh);
3759 if (ret < 0) { 4322 if (ret < 0)
3760 mlog_errno(ret); 4323 mlog_errno(ret);
3761 goto leave;
3762 }
3763 4324
3764leave: 4325leave:
3765 if (handle)
3766 ocfs2_commit_trans(osb, handle);
3767 if (data_ac)
3768 ocfs2_free_alloc_context(data_ac);
3769 if (meta_ac)
3770 ocfs2_free_alloc_context(meta_ac);
3771
3772 return ret; 4326 return ret;
3773} 4327}
3774 4328
3775/* 4329/*
3776 * Extend a new xattr bucket and move xattrs to the end one by one until 4330 * We are given an extent. 'first' is the bucket at the very front of
3777 * We meet with start_bh. Only move half of the xattrs to the bucket after it. 4331 * the extent. The extent has space for an additional bucket past
4332 * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number
4333 * of the target bucket. We wish to shift every bucket past the target
4334 * down one, filling in that additional space. When we get back to the
4335 * target, we split the target between itself and the now-empty bucket
4336 * at target+1 (aka, target_blkno + blks_per_bucket).
3778 */ 4337 */
3779static int ocfs2_extend_xattr_bucket(struct inode *inode, 4338static int ocfs2_extend_xattr_bucket(struct inode *inode,
3780 struct buffer_head *first_bh, 4339 handle_t *handle,
3781 struct buffer_head *start_bh, 4340 struct ocfs2_xattr_bucket *first,
4341 u64 target_blk,
3782 u32 num_clusters) 4342 u32 num_clusters)
3783{ 4343{
3784 int ret, credits; 4344 int ret, credits;
3785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3786 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4346 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3787 u64 start_blk = start_bh->b_blocknr, end_blk; 4347 u64 end_blk;
3788 u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); 4348 u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
3789 handle_t *handle;
3790 struct ocfs2_xattr_header *first_xh =
3791 (struct ocfs2_xattr_header *)first_bh->b_data;
3792 u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
3793 4349
3794 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " 4350 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
3795 "from %llu, len = %u\n", (unsigned long long)start_blk, 4351 "from %llu, len = %u\n", (unsigned long long)target_blk,
3796 (unsigned long long)first_bh->b_blocknr, num_clusters); 4352 (unsigned long long)bucket_blkno(first), num_clusters);
3797 4353
3798 BUG_ON(bucket >= num_buckets); 4354 /* The extent must have room for an additional bucket */
4355 BUG_ON(new_bucket >=
4356 (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
3799 4357
3800 end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; 4358 /* end_blk points to the last existing bucket */
4359 end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
3801 4360
3802 /* 4361 /*
3803 * We will touch all the buckets after the start_bh(include it). 4362 * end_blk is the start of the last existing bucket.
3804 * Add one more bucket and modify the first_bh. 4363 * Thus, (end_blk - target_blk) covers the target bucket and
4364 * every bucket after it up to, but not including, the last
4365 * existing bucket. Then we add the last existing bucket, the
4366 * new bucket, and the first bucket (3 * blk_per_bucket).
3805 */ 4367 */
3806 credits = end_blk - start_blk + 2 * blk_per_bucket + 1; 4368 credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
3807 handle = ocfs2_start_trans(osb, credits); 4369 handle->h_buffer_credits;
3808 if (IS_ERR(handle)) { 4370 ret = ocfs2_extend_trans(handle, credits);
3809 ret = PTR_ERR(handle); 4371 if (ret) {
3810 handle = NULL;
3811 mlog_errno(ret); 4372 mlog_errno(ret);
3812 goto out; 4373 goto out;
3813 } 4374 }
3814 4375
3815 ret = ocfs2_journal_access(handle, inode, first_bh, 4376 ret = ocfs2_xattr_bucket_journal_access(handle, first,
3816 OCFS2_JOURNAL_ACCESS_WRITE); 4377 OCFS2_JOURNAL_ACCESS_WRITE);
3817 if (ret) { 4378 if (ret) {
3818 mlog_errno(ret); 4379 mlog_errno(ret);
3819 goto commit; 4380 goto out;
3820 } 4381 }
3821 4382
3822 while (end_blk != start_blk) { 4383 while (end_blk != target_blk) {
3823 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, 4384 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
3824 end_blk + blk_per_bucket, 0); 4385 end_blk + blk_per_bucket, 0);
3825 if (ret) 4386 if (ret)
3826 goto commit; 4387 goto out;
3827 end_blk -= blk_per_bucket; 4388 end_blk -= blk_per_bucket;
3828 } 4389 }
3829 4390
3830 /* Move half of the xattr in start_blk to the next bucket. */ 4391 /* Move half of the xattr in target_blkno to the next bucket. */
3831 ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk, 4392 ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
3832 start_blk + blk_per_bucket, NULL, 0); 4393 target_blk + blk_per_bucket, NULL, 0);
3833 4394
3834 le16_add_cpu(&first_xh->xh_num_buckets, 1); 4395 le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
3835 ocfs2_journal_dirty(handle, first_bh); 4396 ocfs2_xattr_bucket_journal_dirty(handle, first);
3836 4397
3837commit:
3838 ocfs2_commit_trans(osb, handle);
3839out: 4398out:
3840 return ret; 4399 return ret;
3841} 4400}
3842 4401
3843/* 4402/*
3844 * Add new xattr bucket in an extent record and adjust the buckets accordingly. 4403 * Add new xattr bucket in an extent record and adjust the buckets
3845 * xb_bh is the ocfs2_xattr_block. 4404 * accordingly. xb_bh is the ocfs2_xattr_block, and target is the
3846 * We will move all the buckets starting from header_bh to the next place. As 4405 * bucket we want to insert into.
3847 * for this one, half num of its xattrs will be moved to the next one. 4406 *
4407 * In the easy case, we will move all the buckets after target down by
4408 * one. Half of target's xattrs will be moved to the next bucket.
3848 * 4409 *
3849 * We will allocate a new cluster if current cluster is full and adjust 4410 * If current cluster is full, we'll allocate a new one. This may not
3850 * header_bh and first_bh if the insert place is moved to the new cluster. 4411 * be contiguous. The underlying calls will make sure that there is
4412 * space for the insert, shifting buckets around if necessary.
4413 * 'target' may be moved by those calls.
3851 */ 4414 */
3852static int ocfs2_add_new_xattr_bucket(struct inode *inode, 4415static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3853 struct buffer_head *xb_bh, 4416 struct buffer_head *xb_bh,
3854 struct buffer_head *header_bh) 4417 struct ocfs2_xattr_bucket *target,
4418 struct ocfs2_xattr_set_ctxt *ctxt)
3855{ 4419{
3856 struct ocfs2_xattr_header *first_xh = NULL;
3857 struct buffer_head *first_bh = NULL;
3858 struct ocfs2_xattr_block *xb = 4420 struct ocfs2_xattr_block *xb =
3859 (struct ocfs2_xattr_block *)xb_bh->b_data; 4421 (struct ocfs2_xattr_block *)xb_bh->b_data;
3860 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; 4422 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
3861 struct ocfs2_extent_list *el = &xb_root->xt_list; 4423 struct ocfs2_extent_list *el = &xb_root->xt_list;
3862 struct ocfs2_xattr_header *xh = 4424 u32 name_hash =
3863 (struct ocfs2_xattr_header *)header_bh->b_data; 4425 le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
3864 u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); 4426 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3865 struct super_block *sb = inode->i_sb;
3866 struct ocfs2_super *osb = OCFS2_SB(sb);
3867 int ret, num_buckets, extend = 1; 4427 int ret, num_buckets, extend = 1;
3868 u64 p_blkno; 4428 u64 p_blkno;
3869 u32 e_cpos, num_clusters; 4429 u32 e_cpos, num_clusters;
4430 /* The bucket at the front of the extent */
4431 struct ocfs2_xattr_bucket *first;
3870 4432
3871 mlog(0, "Add new xattr bucket starting form %llu\n", 4433 mlog(0, "Add new xattr bucket starting from %llu\n",
3872 (unsigned long long)header_bh->b_blocknr); 4434 (unsigned long long)bucket_blkno(target));
3873 4435
3874 /* 4436 /* The first bucket of the original extent */
3875 * Add refrence for header_bh here because it may be 4437 first = ocfs2_xattr_bucket_new(inode);
3876 * changed in ocfs2_add_new_xattr_cluster and we need 4438 if (!first) {
3877 * to free it in the end. 4439 ret = -ENOMEM;
3878 */ 4440 mlog_errno(ret);
3879 get_bh(header_bh); 4441 goto out;
4442 }
3880 4443
3881 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, 4444 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
3882 &num_clusters, el); 4445 &num_clusters, el);
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3885 goto out; 4448 goto out;
3886 } 4449 }
3887 4450
3888 ret = ocfs2_read_block(inode, p_blkno, &first_bh); 4451 ret = ocfs2_read_xattr_bucket(first, p_blkno);
3889 if (ret) { 4452 if (ret) {
3890 mlog_errno(ret); 4453 mlog_errno(ret);
3891 goto out; 4454 goto out;
3892 } 4455 }
3893 4456
3894 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; 4457 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
3895 first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; 4458 if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
3896 4459 /*
3897 if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { 4460 * This can move first+target if the target bucket moves
4461 * to the new extent.
4462 */
3898 ret = ocfs2_add_new_xattr_cluster(inode, 4463 ret = ocfs2_add_new_xattr_cluster(inode,
3899 xb_bh, 4464 xb_bh,
3900 &first_bh, 4465 first,
3901 &header_bh, 4466 target,
3902 &num_clusters, 4467 &num_clusters,
3903 e_cpos, 4468 e_cpos,
3904 p_blkno, 4469 &extend,
3905 &extend); 4470 ctxt);
3906 if (ret) { 4471 if (ret) {
3907 mlog_errno(ret); 4472 mlog_errno(ret);
3908 goto out; 4473 goto out;
3909 } 4474 }
3910 } 4475 }
3911 4476
3912 if (extend) 4477 if (extend) {
3913 ret = ocfs2_extend_xattr_bucket(inode, 4478 ret = ocfs2_extend_xattr_bucket(inode,
3914 first_bh, 4479 ctxt->handle,
3915 header_bh, 4480 first,
4481 bucket_blkno(target),
3916 num_clusters); 4482 num_clusters);
3917 if (ret) 4483 if (ret)
3918 mlog_errno(ret); 4484 mlog_errno(ret);
4485 }
4486
3919out: 4487out:
3920 brelse(first_bh); 4488 ocfs2_xattr_bucket_free(first);
3921 brelse(header_bh); 4489
3922 return ret; 4490 return ret;
3923} 4491}
3924 4492
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
3929 int block_off = offs >> inode->i_sb->s_blocksize_bits; 4497 int block_off = offs >> inode->i_sb->s_blocksize_bits;
3930 4498
3931 offs = offs % inode->i_sb->s_blocksize; 4499 offs = offs % inode->i_sb->s_blocksize;
3932 return bucket->bhs[block_off]->b_data + offs; 4500 return bucket_block(bucket, block_off) + offs;
3933} 4501}
3934 4502
3935/* 4503/*
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
3984 xe->xe_value_size = 0; 4552 xe->xe_value_size = 0;
3985 4553
3986 val = ocfs2_xattr_bucket_get_val(inode, 4554 val = ocfs2_xattr_bucket_get_val(inode,
3987 &xs->bucket, offs); 4555 xs->bucket, offs);
3988 memset(val + OCFS2_XATTR_SIZE(name_len), 0, 4556 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
3989 size - OCFS2_XATTR_SIZE(name_len)); 4557 size - OCFS2_XATTR_SIZE(name_len));
3990 if (OCFS2_XATTR_SIZE(xi->value_len) > 0) 4558 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4062,8 +4630,7 @@ set_new_name_value:
4062 xh->xh_free_start = cpu_to_le16(offs); 4630 xh->xh_free_start = cpu_to_le16(offs);
4063 } 4631 }
4064 4632
4065 val = ocfs2_xattr_bucket_get_val(inode, 4633 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4066 &xs->bucket, offs - size);
4067 xe->xe_name_offset = cpu_to_le16(offs - size); 4634 xe->xe_name_offset = cpu_to_le16(offs - size);
4068 4635
4069 memset(val, 0, size); 4636 memset(val, 0, size);
@@ -4079,125 +4646,45 @@ set_new_name_value:
4079 return; 4646 return;
4080} 4647}
4081 4648
4082static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
4083 handle_t *handle,
4084 struct ocfs2_xattr_search *xs,
4085 struct buffer_head **bhs,
4086 u16 bh_num)
4087{
4088 int ret = 0, off, block_off;
4089 struct ocfs2_xattr_entry *xe = xs->here;
4090
4091 /*
4092 * First calculate all the blocks we should journal_access
4093 * and journal_dirty. The first block should always be touched.
4094 */
4095 ret = ocfs2_journal_dirty(handle, bhs[0]);
4096 if (ret)
4097 mlog_errno(ret);
4098
4099 /* calc the data. */
4100 off = le16_to_cpu(xe->xe_name_offset);
4101 block_off = off >> inode->i_sb->s_blocksize_bits;
4102 ret = ocfs2_journal_dirty(handle, bhs[block_off]);
4103 if (ret)
4104 mlog_errno(ret);
4105
4106 return ret;
4107}
4108
4109/* 4649/*
4110 * Set the xattr entry in the specified bucket. 4650 * Set the xattr entry in the specified bucket.
4111 * The bucket is indicated by xs->bucket and it should have the enough 4651 * The bucket is indicated by xs->bucket and it should have the enough
4112 * space for the xattr insertion. 4652 * space for the xattr insertion.
4113 */ 4653 */
4114static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, 4654static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4655 handle_t *handle,
4115 struct ocfs2_xattr_info *xi, 4656 struct ocfs2_xattr_info *xi,
4116 struct ocfs2_xattr_search *xs, 4657 struct ocfs2_xattr_search *xs,
4117 u32 name_hash, 4658 u32 name_hash,
4118 int local) 4659 int local)
4119{ 4660{
4120 int i, ret; 4661 int ret;
4121 handle_t *handle = NULL; 4662 u64 blkno;
4122 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4123 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4124 4663
4125 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", 4664 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4126 (unsigned long)xi->value_len, xi->name_index, 4665 (unsigned long)xi->value_len, xi->name_index,
4127 (unsigned long long)xs->bucket.bhs[0]->b_blocknr); 4666 (unsigned long long)bucket_blkno(xs->bucket));
4128 4667
4129 if (!xs->bucket.bhs[1]) { 4668 if (!xs->bucket->bu_bhs[1]) {
4130 ret = ocfs2_read_blocks(inode, 4669 blkno = bucket_blkno(xs->bucket);
4131 xs->bucket.bhs[0]->b_blocknr + 1, 4670 ocfs2_xattr_bucket_relse(xs->bucket);
4132 blk_per_bucket - 1, &xs->bucket.bhs[1], 4671 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4133 0);
4134 if (ret) { 4672 if (ret) {
4135 mlog_errno(ret); 4673 mlog_errno(ret);
4136 goto out; 4674 goto out;
4137 } 4675 }
4138 } 4676 }
4139 4677
4140 handle = ocfs2_start_trans(osb, blk_per_bucket); 4678 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4141 if (IS_ERR(handle)) { 4679 OCFS2_JOURNAL_ACCESS_WRITE);
4142 ret = PTR_ERR(handle); 4680 if (ret < 0) {
4143 handle = NULL;
4144 mlog_errno(ret); 4681 mlog_errno(ret);
4145 goto out; 4682 goto out;
4146 } 4683 }
4147 4684
4148 for (i = 0; i < blk_per_bucket; i++) {
4149 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
4150 OCFS2_JOURNAL_ACCESS_WRITE);
4151 if (ret < 0) {
4152 mlog_errno(ret);
4153 goto out;
4154 }
4155 }
4156
4157 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); 4685 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4686 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4158 4687
4159 /*Only dirty the blocks we have touched in set xattr. */
4160 ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
4161 xs->bucket.bhs, blk_per_bucket);
4162 if (ret)
4163 mlog_errno(ret);
4164out:
4165 ocfs2_commit_trans(osb, handle);
4166
4167 return ret;
4168}
4169
4170static int ocfs2_xattr_value_update_size(struct inode *inode,
4171 struct buffer_head *xe_bh,
4172 struct ocfs2_xattr_entry *xe,
4173 u64 new_size)
4174{
4175 int ret;
4176 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4177 handle_t *handle = NULL;
4178
4179 handle = ocfs2_start_trans(osb, 1);
4180 if (IS_ERR(handle)) {
4181 ret = -ENOMEM;
4182 mlog_errno(ret);
4183 goto out;
4184 }
4185
4186 ret = ocfs2_journal_access(handle, inode, xe_bh,
4187 OCFS2_JOURNAL_ACCESS_WRITE);
4188 if (ret < 0) {
4189 mlog_errno(ret);
4190 goto out_commit;
4191 }
4192
4193 xe->xe_value_size = cpu_to_le64(new_size);
4194
4195 ret = ocfs2_journal_dirty(handle, xe_bh);
4196 if (ret < 0)
4197 mlog_errno(ret);
4198
4199out_commit:
4200 ocfs2_commit_trans(osb, handle);
4201out: 4688out:
4202 return ret; 4689 return ret;
4203} 4690}
@@ -4210,18 +4697,19 @@ out:
4210 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. 4697 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
4211 */ 4698 */
4212static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, 4699static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4213 struct buffer_head *header_bh, 4700 struct ocfs2_xattr_bucket *bucket,
4214 int xe_off, 4701 int xe_off,
4215 int len) 4702 int len,
4703 struct ocfs2_xattr_set_ctxt *ctxt)
4216{ 4704{
4217 int ret, offset; 4705 int ret, offset;
4218 u64 value_blk; 4706 u64 value_blk;
4219 struct buffer_head *value_bh = NULL;
4220 struct ocfs2_xattr_value_root *xv;
4221 struct ocfs2_xattr_entry *xe; 4707 struct ocfs2_xattr_entry *xe;
4222 struct ocfs2_xattr_header *xh = 4708 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4223 (struct ocfs2_xattr_header *)header_bh->b_data;
4224 size_t blocksize = inode->i_sb->s_blocksize; 4709 size_t blocksize = inode->i_sb->s_blocksize;
4710 struct ocfs2_xattr_value_buf vb = {
4711 .vb_access = ocfs2_journal_access,
4712 };
4225 4713
4226 xe = &xh->xh_entries[xe_off]; 4714 xe = &xh->xh_entries[xe_off];
4227 4715
@@ -4234,49 +4722,57 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4234 4722
4235 /* We don't allow ocfs2_xattr_value to be stored in different block. */ 4723 /* We don't allow ocfs2_xattr_value to be stored in different block. */
4236 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); 4724 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
4237 value_blk += header_bh->b_blocknr;
4238 4725
4239 ret = ocfs2_read_block(inode, value_blk, &value_bh); 4726 vb.vb_bh = bucket->bu_bhs[value_blk];
4240 if (ret) { 4727 BUG_ON(!vb.vb_bh);
4241 mlog_errno(ret);
4242 goto out;
4243 }
4244 4728
4245 xv = (struct ocfs2_xattr_value_root *) 4729 vb.vb_xv = (struct ocfs2_xattr_value_root *)
4246 (value_bh->b_data + offset % blocksize); 4730 (vb.vb_bh->b_data + offset % blocksize);
4247 4731
4732 /*
4733 * From here on out we have to dirty the bucket. The generic
4734 * value calls only modify one of the bucket's bhs, but we need
4735 * to send the bucket at once. So if they error, they *could* have
4736 * modified something. We have to assume they did, and dirty
4737 * the whole bucket. This leaves us in a consistent state.
4738 */
4248 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", 4739 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
4249 xe_off, (unsigned long long)header_bh->b_blocknr, len); 4740 xe_off, (unsigned long long)bucket_blkno(bucket), len);
4250 ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len); 4741 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
4251 if (ret) { 4742 if (ret) {
4252 mlog_errno(ret); 4743 mlog_errno(ret);
4253 goto out; 4744 goto out;
4254 } 4745 }
4255 4746
4256 ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); 4747 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4748 OCFS2_JOURNAL_ACCESS_WRITE);
4257 if (ret) { 4749 if (ret) {
4258 mlog_errno(ret); 4750 mlog_errno(ret);
4259 goto out; 4751 goto out;
4260 } 4752 }
4261 4753
4754 xe->xe_value_size = cpu_to_le64(len);
4755
4756 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
4757
4262out: 4758out:
4263 brelse(value_bh);
4264 return ret; 4759 return ret;
4265} 4760}
4266 4761
4267static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, 4762static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4268 struct ocfs2_xattr_search *xs, 4763 struct ocfs2_xattr_search *xs,
4269 int len) 4764 int len,
4765 struct ocfs2_xattr_set_ctxt *ctxt)
4270{ 4766{
4271 int ret, offset; 4767 int ret, offset;
4272 struct ocfs2_xattr_entry *xe = xs->here; 4768 struct ocfs2_xattr_entry *xe = xs->here;
4273 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; 4769 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
4274 4770
4275 BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); 4771 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
4276 4772
4277 offset = xe - xh->xh_entries; 4773 offset = xe - xh->xh_entries;
4278 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], 4774 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
4279 offset, len); 4775 offset, len, ctxt);
4280 if (ret) 4776 if (ret)
4281 mlog_errno(ret); 4777 mlog_errno(ret);
4282 4778
@@ -4284,6 +4780,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4284} 4780}
4285 4781
4286static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, 4782static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4783 handle_t *handle,
4287 struct ocfs2_xattr_search *xs, 4784 struct ocfs2_xattr_search *xs,
4288 char *val, 4785 char *val,
4289 int value_len) 4786 int value_len)
@@ -4299,7 +4796,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4299 4796
4300 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); 4797 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
4301 4798
4302 return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); 4799 return __ocfs2_xattr_set_value_outside(inode, handle,
4800 xv, val, value_len);
4303} 4801}
4304 4802
4305static int ocfs2_rm_xattr_cluster(struct inode *inode, 4803static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4343,15 +4841,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4343 } 4841 }
4344 } 4842 }
4345 4843
4346 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); 4844 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
4347 if (IS_ERR(handle)) { 4845 if (IS_ERR(handle)) {
4348 ret = -ENOMEM; 4846 ret = -ENOMEM;
4349 mlog_errno(ret); 4847 mlog_errno(ret);
4350 goto out; 4848 goto out;
4351 } 4849 }
4352 4850
4353 ret = ocfs2_journal_access(handle, inode, root_bh, 4851 ret = ocfs2_journal_access_xb(handle, inode, root_bh,
4354 OCFS2_JOURNAL_ACCESS_WRITE); 4852 OCFS2_JOURNAL_ACCESS_WRITE);
4355 if (ret) { 4853 if (ret) {
4356 mlog_errno(ret); 4854 mlog_errno(ret);
4357 goto out_commit; 4855 goto out_commit;
@@ -4392,26 +4890,19 @@ out:
4392} 4890}
4393 4891
4394static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, 4892static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4893 handle_t *handle,
4395 struct ocfs2_xattr_search *xs) 4894 struct ocfs2_xattr_search *xs)
4396{ 4895{
4397 handle_t *handle = NULL; 4896 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4398 struct ocfs2_xattr_header *xh = xs->bucket.xh;
4399 struct ocfs2_xattr_entry *last = &xh->xh_entries[ 4897 struct ocfs2_xattr_entry *last = &xh->xh_entries[
4400 le16_to_cpu(xh->xh_count) - 1]; 4898 le16_to_cpu(xh->xh_count) - 1];
4401 int ret = 0; 4899 int ret = 0;
4402 4900
4403 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); 4901 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4404 if (IS_ERR(handle)) { 4902 OCFS2_JOURNAL_ACCESS_WRITE);
4405 ret = PTR_ERR(handle);
4406 mlog_errno(ret);
4407 return;
4408 }
4409
4410 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
4411 OCFS2_JOURNAL_ACCESS_WRITE);
4412 if (ret) { 4903 if (ret) {
4413 mlog_errno(ret); 4904 mlog_errno(ret);
4414 goto out_commit; 4905 return;
4415 } 4906 }
4416 4907
4417 /* Remove the old entry. */ 4908 /* Remove the old entry. */
@@ -4420,11 +4911,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4420 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 4911 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
4421 le16_add_cpu(&xh->xh_count, -1); 4912 le16_add_cpu(&xh->xh_count, -1);
4422 4913
4423 ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); 4914 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4424 if (ret < 0)
4425 mlog_errno(ret);
4426out_commit:
4427 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
4428} 4915}
4429 4916
4430/* 4917/*
@@ -4440,7 +4927,8 @@ out_commit:
4440 */ 4927 */
4441static int ocfs2_xattr_set_in_bucket(struct inode *inode, 4928static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4442 struct ocfs2_xattr_info *xi, 4929 struct ocfs2_xattr_info *xi,
4443 struct ocfs2_xattr_search *xs) 4930 struct ocfs2_xattr_search *xs,
4931 struct ocfs2_xattr_set_ctxt *ctxt)
4444{ 4932{
4445 int ret, local = 1; 4933 int ret, local = 1;
4446 size_t value_len; 4934 size_t value_len;
@@ -4468,7 +4956,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4468 value_len = 0; 4956 value_len = 0;
4469 4957
4470 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, 4958 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4471 value_len); 4959 value_len,
4960 ctxt);
4472 if (ret) 4961 if (ret)
4473 goto out; 4962 goto out;
4474 4963
@@ -4488,7 +4977,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4488 xi->value_len = OCFS2_XATTR_ROOT_SIZE; 4977 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
4489 } 4978 }
4490 4979
4491 ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); 4980 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
4981 name_hash, local);
4492 if (ret) { 4982 if (ret) {
4493 mlog_errno(ret); 4983 mlog_errno(ret);
4494 goto out; 4984 goto out;
@@ -4499,7 +4989,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4499 4989
4500 /* allocate the space now for the outside block storage. */ 4990 /* allocate the space now for the outside block storage. */
4501 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, 4991 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4502 value_len); 4992 value_len, ctxt);
4503 if (ret) { 4993 if (ret) {
4504 mlog_errno(ret); 4994 mlog_errno(ret);
4505 4995
@@ -4509,13 +4999,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4509 * storage and we have allocated xattr already, 4999 * storage and we have allocated xattr already,
4510 * so need to remove it. 5000 * so need to remove it.
4511 */ 5001 */
4512 ocfs2_xattr_bucket_remove_xs(inode, xs); 5002 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
4513 } 5003 }
4514 goto out; 5004 goto out;
4515 } 5005 }
4516 5006
4517set_value_outside: 5007set_value_outside:
4518 ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); 5008 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5009 xs, val, value_len);
4519out: 5010out:
4520 return ret; 5011 return ret;
4521} 5012}
@@ -4530,7 +5021,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4530 struct ocfs2_xattr_bucket *bucket, 5021 struct ocfs2_xattr_bucket *bucket,
4531 const char *name) 5022 const char *name)
4532{ 5023{
4533 struct ocfs2_xattr_header *xh = bucket->xh; 5024 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4534 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); 5025 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
4535 5026
4536 if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) 5027 if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +5031,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4540 xh->xh_entries[0].xe_name_hash) { 5031 xh->xh_entries[0].xe_name_hash) {
4541 mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " 5032 mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
4542 "hash = %u\n", 5033 "hash = %u\n",
4543 (unsigned long long)bucket->bhs[0]->b_blocknr, 5034 (unsigned long long)bucket_blkno(bucket),
4544 le32_to_cpu(xh->xh_entries[0].xe_name_hash)); 5035 le32_to_cpu(xh->xh_entries[0].xe_name_hash));
4545 return -ENOSPC; 5036 return -ENOSPC;
4546 } 5037 }
@@ -4550,16 +5041,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4550 5041
4551static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5042static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
4552 struct ocfs2_xattr_info *xi, 5043 struct ocfs2_xattr_info *xi,
4553 struct ocfs2_xattr_search *xs) 5044 struct ocfs2_xattr_search *xs,
5045 struct ocfs2_xattr_set_ctxt *ctxt)
4554{ 5046{
4555 struct ocfs2_xattr_header *xh; 5047 struct ocfs2_xattr_header *xh;
4556 struct ocfs2_xattr_entry *xe; 5048 struct ocfs2_xattr_entry *xe;
4557 u16 count, header_size, xh_free_start; 5049 u16 count, header_size, xh_free_start;
4558 int i, free, max_free, need, old; 5050 int free, max_free, need, old;
4559 size_t value_size = 0, name_len = strlen(xi->name); 5051 size_t value_size = 0, name_len = strlen(xi->name);
4560 size_t blocksize = inode->i_sb->s_blocksize; 5052 size_t blocksize = inode->i_sb->s_blocksize;
4561 int ret, allocation = 0; 5053 int ret, allocation = 0;
4562 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4563 5054
4564 mlog_entry("Set xattr %s in xattr index block\n", xi->name); 5055 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
4565 5056
@@ -4574,7 +5065,7 @@ try_again:
4574 5065
4575 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " 5066 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
4576 "of %u which exceed block size\n", 5067 "of %u which exceed block size\n",
4577 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, 5068 (unsigned long long)bucket_blkno(xs->bucket),
4578 header_size); 5069 header_size);
4579 5070
4580 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5071 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,11 +5105,13 @@ try_again:
4614 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " 5105 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
4615 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" 5106 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
4616 " %u\n", xs->not_found, 5107 " %u\n", xs->not_found,
4617 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, 5108 (unsigned long long)bucket_blkno(xs->bucket),
4618 free, need, max_free, le16_to_cpu(xh->xh_free_start), 5109 free, need, max_free, le16_to_cpu(xh->xh_free_start),
4619 le16_to_cpu(xh->xh_name_value_len)); 5110 le16_to_cpu(xh->xh_name_value_len));
4620 5111
4621 if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { 5112 if (free < need ||
5113 (xs->not_found &&
5114 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
4622 if (need <= max_free && 5115 if (need <= max_free &&
4623 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { 5116 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4624 /* 5117 /*
@@ -4626,7 +5119,8 @@ try_again:
4626 * name/value will be moved, the xe shouldn't be changed 5119 * name/value will be moved, the xe shouldn't be changed
4627 * in xs. 5120 * in xs.
4628 */ 5121 */
4629 ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); 5122 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5123 xs->bucket);
4630 if (ret) { 5124 if (ret) {
4631 mlog_errno(ret); 5125 mlog_errno(ret);
4632 goto out; 5126 goto out;
@@ -4658,7 +5152,7 @@ try_again:
4658 * add a new bucket for the insert. 5152 * add a new bucket for the insert.
4659 */ 5153 */
4660 ret = ocfs2_check_xattr_bucket_collision(inode, 5154 ret = ocfs2_check_xattr_bucket_collision(inode,
4661 &xs->bucket, 5155 xs->bucket,
4662 xi->name); 5156 xi->name);
4663 if (ret) { 5157 if (ret) {
4664 mlog_errno(ret); 5158 mlog_errno(ret);
@@ -4667,17 +5161,21 @@ try_again:
4667 5161
4668 ret = ocfs2_add_new_xattr_bucket(inode, 5162 ret = ocfs2_add_new_xattr_bucket(inode,
4669 xs->xattr_bh, 5163 xs->xattr_bh,
4670 xs->bucket.bhs[0]); 5164 xs->bucket,
5165 ctxt);
4671 if (ret) { 5166 if (ret) {
4672 mlog_errno(ret); 5167 mlog_errno(ret);
4673 goto out; 5168 goto out;
4674 } 5169 }
4675 5170
4676 for (i = 0; i < blk_per_bucket; i++) 5171 /*
4677 brelse(xs->bucket.bhs[i]); 5172 * ocfs2_add_new_xattr_bucket() will have updated
4678 5173 * xs->bucket if it moved, but it will not have updated
4679 memset(&xs->bucket, 0, sizeof(xs->bucket)); 5174 * any of the other search fields. Thus, we drop it and
4680 5175 * re-search. Everything should be cached, so it'll be
5176 * quick.
5177 */
5178 ocfs2_xattr_bucket_relse(xs->bucket);
4681 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, 5179 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
4682 xi->name_index, 5180 xi->name_index,
4683 xi->name, xs); 5181 xi->name, xs);
@@ -4689,7 +5187,7 @@ try_again:
4689 } 5187 }
4690 5188
4691xattr_set: 5189xattr_set:
4692 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); 5190 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
4693out: 5191out:
4694 mlog_exit(ret); 5192 mlog_exit(ret);
4695 return ret; 5193 return ret;
@@ -4700,24 +5198,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
4700 void *para) 5198 void *para)
4701{ 5199{
4702 int ret = 0; 5200 int ret = 0;
4703 struct ocfs2_xattr_header *xh = bucket->xh; 5201 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4704 u16 i; 5202 u16 i;
4705 struct ocfs2_xattr_entry *xe; 5203 struct ocfs2_xattr_entry *xe;
5204 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5205 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
5206 int credits = ocfs2_remove_extent_credits(osb->sb) +
5207 ocfs2_blocks_per_xattr_bucket(inode->i_sb);
5208
5209
5210 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
4706 5211
4707 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 5212 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4708 xe = &xh->xh_entries[i]; 5213 xe = &xh->xh_entries[i];
4709 if (ocfs2_xattr_is_local(xe)) 5214 if (ocfs2_xattr_is_local(xe))
4710 continue; 5215 continue;
4711 5216
4712 ret = ocfs2_xattr_bucket_value_truncate(inode, 5217 ctxt.handle = ocfs2_start_trans(osb, credits);
4713 bucket->bhs[0], 5218 if (IS_ERR(ctxt.handle)) {
4714 i, 0); 5219 ret = PTR_ERR(ctxt.handle);
5220 mlog_errno(ret);
5221 break;
5222 }
5223
5224 ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
5225 i, 0, &ctxt);
5226
5227 ocfs2_commit_trans(osb, ctxt.handle);
4715 if (ret) { 5228 if (ret) {
4716 mlog_errno(ret); 5229 mlog_errno(ret);
4717 break; 5230 break;
4718 } 5231 }
4719 } 5232 }
4720 5233
5234 ocfs2_schedule_truncate_log_flush(osb, 1);
5235 ocfs2_run_deallocs(osb, &ctxt.dealloc);
4721 return ret; 5236 return ret;
4722} 5237}
4723 5238
@@ -4768,6 +5283,74 @@ out:
4768} 5283}
4769 5284
4770/* 5285/*
5286 * 'security' attributes support
5287 */
5288static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
5289 size_t list_size, const char *name,
5290 size_t name_len)
5291{
5292 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
5293 const size_t total_len = prefix_len + name_len + 1;
5294
5295 if (list && total_len <= list_size) {
5296 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
5297 memcpy(list + prefix_len, name, name_len);
5298 list[prefix_len + name_len] = '\0';
5299 }
5300 return total_len;
5301}
5302
5303static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
5304 void *buffer, size_t size)
5305{
5306 if (strcmp(name, "") == 0)
5307 return -EINVAL;
5308 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
5309 buffer, size);
5310}
5311
5312static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
5313 const void *value, size_t size, int flags)
5314{
5315 if (strcmp(name, "") == 0)
5316 return -EINVAL;
5317
5318 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
5319 size, flags);
5320}
5321
5322int ocfs2_init_security_get(struct inode *inode,
5323 struct inode *dir,
5324 struct ocfs2_security_xattr_info *si)
5325{
5326 /* check whether ocfs2 support feature xattr */
5327 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
5328 return -EOPNOTSUPP;
5329 return security_inode_init_security(inode, dir, &si->name, &si->value,
5330 &si->value_len);
5331}
5332
5333int ocfs2_init_security_set(handle_t *handle,
5334 struct inode *inode,
5335 struct buffer_head *di_bh,
5336 struct ocfs2_security_xattr_info *si,
5337 struct ocfs2_alloc_context *xattr_ac,
5338 struct ocfs2_alloc_context *data_ac)
5339{
5340 return ocfs2_xattr_set_handle(handle, inode, di_bh,
5341 OCFS2_XATTR_INDEX_SECURITY,
5342 si->name, si->value, si->value_len, 0,
5343 xattr_ac, data_ac);
5344}
5345
5346struct xattr_handler ocfs2_xattr_security_handler = {
5347 .prefix = XATTR_SECURITY_PREFIX,
5348 .list = ocfs2_xattr_security_list,
5349 .get = ocfs2_xattr_security_get,
5350 .set = ocfs2_xattr_security_set,
5351};
5352
5353/*
4771 * 'trusted' attributes support 5354 * 'trusted' attributes support
4772 */ 5355 */
4773static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, 5356static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656d..5a1ebc789f7e 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
30 OCFS2_XATTR_MAX 30 OCFS2_XATTR_MAX
31}; 31};
32 32
33struct ocfs2_security_xattr_info {
34 int enable;
35 char *name;
36 void *value;
37 size_t value_len;
38};
39
33extern struct xattr_handler ocfs2_xattr_user_handler; 40extern struct xattr_handler ocfs2_xattr_user_handler;
34extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler;
43#ifdef CONFIG_OCFS2_FS_POSIX_ACL
44extern struct xattr_handler ocfs2_xattr_acl_access_handler;
45extern struct xattr_handler ocfs2_xattr_acl_default_handler;
46#endif
35extern struct xattr_handler *ocfs2_xattr_handlers[]; 47extern struct xattr_handler *ocfs2_xattr_handlers[];
36 48
37ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 49ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
50int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
51 const char *, void *, size_t);
38int ocfs2_xattr_set(struct inode *, int, const char *, const void *, 52int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
39 size_t, int); 53 size_t, int);
54int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
55 int, const char *, const void *, size_t, int,
56 struct ocfs2_alloc_context *,
57 struct ocfs2_alloc_context *);
40int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 58int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *,
60 struct ocfs2_security_xattr_info *);
61int ocfs2_init_security_set(handle_t *, struct inode *,
62 struct buffer_head *,
63 struct ocfs2_security_xattr_info *,
64 struct ocfs2_alloc_context *,
65 struct ocfs2_alloc_context *);
66int ocfs2_calc_security_init(struct inode *,
67 struct ocfs2_security_xattr_info *,
68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **);
72
73/*
74 * xattrs can live inside an inode, as part of an external xattr block,
75 * or inside an xattr bucket, which is the leaf of a tree rooted in an
76 * xattr block. Some of the xattr calls, especially the value setting
77 * functions, want to treat each of these locations as equal. Let's wrap
78 * them in a structure that we can pass around instead of raw buffer_heads.
79 */
80struct ocfs2_xattr_value_buf {
81 struct buffer_head *vb_bh;
82 ocfs2_journal_access_func vb_access;
83 struct ocfs2_xattr_value_root *vb_xv;
84};
85
41 86
42#endif /* OCFS2_XATTR_H */ 87#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
new file mode 100644
index 000000000000..b1b9a0aba6fd
--- /dev/null
+++ b/fs/omfs/Kconfig
@@ -0,0 +1,13 @@
1config OMFS_FS
2 tristate "SonicBlue Optimized MPEG File System support"
3 depends on BLOCK
4 select CRC_ITU_T
5 help
6 This is the proprietary file system used by the Rio Karma music
7 player and ReplayTV DVR. Despite the name, this filesystem is not
8 more efficient than a standard FS for MPEG files, in fact likely
9 the opposite is true. Say Y if you have either of these devices
10 and wish to mount its disk.
11
12 To compile this file system support as a module, choose M here: the
13 module will be called omfs. If unsure, say N.
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6afe57c84f84..633e9dc972bb 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
39 inode->i_mode = mode; 39 inode->i_mode = mode;
40 inode->i_uid = current_fsuid(); 40 inode->i_uid = current_fsuid();
41 inode->i_gid = current_fsgid(); 41 inode->i_gid = current_fsgid();
42 inode->i_blocks = 0;
43 inode->i_mapping->a_ops = &omfs_aops; 42 inode->i_mapping->a_ops = &omfs_aops;
44 43
45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index c0a426d5766c..a3a78ceb2a2b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -122,7 +122,7 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
122 return 0; 122 return 0;
123} 123}
124 124
125asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * buf) 125SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
126{ 126{
127 struct path path; 127 struct path path;
128 int error; 128 int error;
@@ -138,8 +138,7 @@ asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * b
138 return error; 138 return error;
139} 139}
140 140
141 141SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
142asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct statfs64 __user *buf)
143{ 142{
144 struct path path; 143 struct path path;
145 long error; 144 long error;
@@ -157,8 +156,7 @@ asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct stat
157 return error; 156 return error;
158} 157}
159 158
160 159SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
161asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
162{ 160{
163 struct file * file; 161 struct file * file;
164 struct statfs tmp; 162 struct statfs tmp;
@@ -176,7 +174,7 @@ out:
176 return error; 174 return error;
177} 175}
178 176
179asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user *buf) 177SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
180{ 178{
181 struct file * file; 179 struct file * file;
182 struct statfs64 tmp; 180 struct statfs64 tmp;
@@ -272,6 +270,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
272 goto put_write_and_out; 270 goto put_write_and_out;
273 271
274 error = locks_verify_truncate(inode, NULL, length); 272 error = locks_verify_truncate(inode, NULL, length);
273 if (!error)
274 error = security_path_truncate(&path, length, 0);
275 if (!error) { 275 if (!error) {
276 DQUOT_INIT(inode); 276 DQUOT_INIT(inode);
277 error = do_truncate(path.dentry, length, 0, NULL); 277 error = do_truncate(path.dentry, length, 0, NULL);
@@ -287,7 +287,7 @@ out:
287 return error; 287 return error;
288} 288}
289 289
290asmlinkage long sys_truncate(const char __user * path, unsigned long length) 290SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length)
291{ 291{
292 /* on 32-bit boxen it will cut the range 2^31--2^32-1 off */ 292 /* on 32-bit boxen it will cut the range 2^31--2^32-1 off */
293 return do_sys_truncate(path, (long)length); 293 return do_sys_truncate(path, (long)length);
@@ -329,6 +329,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
329 329
330 error = locks_verify_truncate(inode, file, length); 330 error = locks_verify_truncate(inode, file, length);
331 if (!error) 331 if (!error)
332 error = security_path_truncate(&file->f_path, length,
333 ATTR_MTIME|ATTR_CTIME);
334 if (!error)
332 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); 335 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
333out_putf: 336out_putf:
334 fput(file); 337 fput(file);
@@ -336,7 +339,7 @@ out:
336 return error; 339 return error;
337} 340}
338 341
339asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) 342SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
340{ 343{
341 long ret = do_sys_ftruncate(fd, length, 1); 344 long ret = do_sys_ftruncate(fd, length, 1);
342 /* avoid REGPARM breakage on x86: */ 345 /* avoid REGPARM breakage on x86: */
@@ -346,21 +349,35 @@ asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
346 349
347/* LFS versions of truncate are only needed on 32 bit machines */ 350/* LFS versions of truncate are only needed on 32 bit machines */
348#if BITS_PER_LONG == 32 351#if BITS_PER_LONG == 32
349asmlinkage long sys_truncate64(const char __user * path, loff_t length) 352SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length)
350{ 353{
351 return do_sys_truncate(path, length); 354 return do_sys_truncate(path, length);
352} 355}
356#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
357asmlinkage long SyS_truncate64(long path, loff_t length)
358{
359 return SYSC_truncate64((const char __user *) path, length);
360}
361SYSCALL_ALIAS(sys_truncate64, SyS_truncate64);
362#endif
353 363
354asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) 364SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length)
355{ 365{
356 long ret = do_sys_ftruncate(fd, length, 0); 366 long ret = do_sys_ftruncate(fd, length, 0);
357 /* avoid REGPARM breakage on x86: */ 367 /* avoid REGPARM breakage on x86: */
358 asmlinkage_protect(2, ret, fd, length); 368 asmlinkage_protect(2, ret, fd, length);
359 return ret; 369 return ret;
360} 370}
371#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
372asmlinkage long SyS_ftruncate64(long fd, loff_t length)
373{
374 return SYSC_ftruncate64((unsigned int) fd, length);
375}
376SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
361#endif 377#endif
378#endif /* BITS_PER_LONG == 32 */
362 379
363asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len) 380SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
364{ 381{
365 struct file *file; 382 struct file *file;
366 struct inode *inode; 383 struct inode *inode;
@@ -407,7 +424,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
407 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 424 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
408 goto out_fput; 425 goto out_fput;
409 426
410 if (inode->i_op && inode->i_op->fallocate) 427 if (inode->i_op->fallocate)
411 ret = inode->i_op->fallocate(inode, mode, offset, len); 428 ret = inode->i_op->fallocate(inode, mode, offset, len);
412 else 429 else
413 ret = -EOPNOTSUPP; 430 ret = -EOPNOTSUPP;
@@ -417,13 +434,20 @@ out_fput:
417out: 434out:
418 return ret; 435 return ret;
419} 436}
437#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
438asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
439{
440 return SYSC_fallocate((int)fd, (int)mode, offset, len);
441}
442SYSCALL_ALIAS(sys_fallocate, SyS_fallocate);
443#endif
420 444
421/* 445/*
422 * access() needs to use the real uid/gid, not the effective uid/gid. 446 * access() needs to use the real uid/gid, not the effective uid/gid.
423 * We do this by temporarily clearing all FS-related capabilities and 447 * We do this by temporarily clearing all FS-related capabilities and
424 * switching the fsuid/fsgid around to the real ones. 448 * switching the fsuid/fsgid around to the real ones.
425 */ 449 */
426asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) 450SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
427{ 451{
428 const struct cred *old_cred; 452 const struct cred *old_cred;
429 struct cred *override_cred; 453 struct cred *override_cred;
@@ -493,12 +517,12 @@ out:
493 return res; 517 return res;
494} 518}
495 519
496asmlinkage long sys_access(const char __user *filename, int mode) 520SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
497{ 521{
498 return sys_faccessat(AT_FDCWD, filename, mode); 522 return sys_faccessat(AT_FDCWD, filename, mode);
499} 523}
500 524
501asmlinkage long sys_chdir(const char __user * filename) 525SYSCALL_DEFINE1(chdir, const char __user *, filename)
502{ 526{
503 struct path path; 527 struct path path;
504 int error; 528 int error;
@@ -519,7 +543,7 @@ out:
519 return error; 543 return error;
520} 544}
521 545
522asmlinkage long sys_fchdir(unsigned int fd) 546SYSCALL_DEFINE1(fchdir, unsigned int, fd)
523{ 547{
524 struct file *file; 548 struct file *file;
525 struct inode *inode; 549 struct inode *inode;
@@ -545,7 +569,7 @@ out:
545 return error; 569 return error;
546} 570}
547 571
548asmlinkage long sys_chroot(const char __user * filename) 572SYSCALL_DEFINE1(chroot, const char __user *, filename)
549{ 573{
550 struct path path; 574 struct path path;
551 int error; 575 int error;
@@ -570,7 +594,7 @@ out:
570 return error; 594 return error;
571} 595}
572 596
573asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) 597SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
574{ 598{
575 struct inode * inode; 599 struct inode * inode;
576 struct dentry * dentry; 600 struct dentry * dentry;
@@ -604,8 +628,7 @@ out:
604 return err; 628 return err;
605} 629}
606 630
607asmlinkage long sys_fchmodat(int dfd, const char __user *filename, 631SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
608 mode_t mode)
609{ 632{
610 struct path path; 633 struct path path;
611 struct inode *inode; 634 struct inode *inode;
@@ -634,7 +657,7 @@ out:
634 return error; 657 return error;
635} 658}
636 659
637asmlinkage long sys_chmod(const char __user *filename, mode_t mode) 660SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
638{ 661{
639 return sys_fchmodat(AT_FDCWD, filename, mode); 662 return sys_fchmodat(AT_FDCWD, filename, mode);
640} 663}
@@ -664,7 +687,7 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
664 return error; 687 return error;
665} 688}
666 689
667asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group) 690SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
668{ 691{
669 struct path path; 692 struct path path;
670 int error; 693 int error;
@@ -683,8 +706,8 @@ out:
683 return error; 706 return error;
684} 707}
685 708
686asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, 709SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
687 gid_t group, int flag) 710 gid_t, group, int, flag)
688{ 711{
689 struct path path; 712 struct path path;
690 int error = -EINVAL; 713 int error = -EINVAL;
@@ -708,7 +731,7 @@ out:
708 return error; 731 return error;
709} 732}
710 733
711asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group) 734SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
712{ 735{
713 struct path path; 736 struct path path;
714 int error; 737 int error;
@@ -727,8 +750,7 @@ out:
727 return error; 750 return error;
728} 751}
729 752
730 753SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
731asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
732{ 754{
733 struct file * file; 755 struct file * file;
734 int error = -EBADF; 756 int error = -EBADF;
@@ -1024,7 +1046,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
1024 return fd; 1046 return fd;
1025} 1047}
1026 1048
1027asmlinkage long sys_open(const char __user *filename, int flags, int mode) 1049SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
1028{ 1050{
1029 long ret; 1051 long ret;
1030 1052
@@ -1037,8 +1059,8 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
1037 return ret; 1059 return ret;
1038} 1060}
1039 1061
1040asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, 1062SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
1041 int mode) 1063 int, mode)
1042{ 1064{
1043 long ret; 1065 long ret;
1044 1066
@@ -1057,7 +1079,7 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
1057 * For backward compatibility? Maybe this should be moved 1079 * For backward compatibility? Maybe this should be moved
1058 * into arch/i386 instead? 1080 * into arch/i386 instead?
1059 */ 1081 */
1060asmlinkage long sys_creat(const char __user * pathname, int mode) 1082SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode)
1061{ 1083{
1062 return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); 1084 return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
1063} 1085}
@@ -1093,7 +1115,7 @@ EXPORT_SYMBOL(filp_close);
1093 * releasing the fd. This ensures that one clone task can't release 1115 * releasing the fd. This ensures that one clone task can't release
1094 * an fd while another clone is opening it. 1116 * an fd while another clone is opening it.
1095 */ 1117 */
1096asmlinkage long sys_close(unsigned int fd) 1118SYSCALL_DEFINE1(close, unsigned int, fd)
1097{ 1119{
1098 struct file * filp; 1120 struct file * filp;
1099 struct files_struct *files = current->files; 1121 struct files_struct *files = current->files;
@@ -1126,14 +1148,13 @@ out_unlock:
1126 spin_unlock(&files->file_lock); 1148 spin_unlock(&files->file_lock);
1127 return -EBADF; 1149 return -EBADF;
1128} 1150}
1129
1130EXPORT_SYMBOL(sys_close); 1151EXPORT_SYMBOL(sys_close);
1131 1152
1132/* 1153/*
1133 * This routine simulates a hangup on the tty, to arrange that users 1154 * This routine simulates a hangup on the tty, to arrange that users
1134 * are given clean terminals at login time. 1155 * are given clean terminals at login time.
1135 */ 1156 */
1136asmlinkage long sys_vhangup(void) 1157SYSCALL_DEFINE0(vhangup)
1137{ 1158{
1138 if (capable(CAP_SYS_TTY_CONFIG)) { 1159 if (capable(CAP_SYS_TTY_CONFIG)) {
1139 tty_vhangup_self(); 1160 tty_vhangup_self();
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d41bdc784de4..ffcd04f0012c 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,9 +256,6 @@ found:
256 break; 256 break;
257 } 257 }
258 258
259 inode->i_gid = 0;
260 inode->i_uid = 0;
261
262 d_add(dentry, inode); 259 d_add(dentry, inode);
263 return NULL; 260 return NULL;
264} 261}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d5b213b8a9b..6d720243f5f4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno)
334 334
335 blk_free_devt(part_devt(part)); 335 blk_free_devt(part_devt(part));
336 rcu_assign_pointer(ptbl->part[partno], NULL); 336 rcu_assign_pointer(ptbl->part[partno], NULL);
337 rcu_assign_pointer(ptbl->last_lookup, NULL);
337 kobject_put(part->holder_dir); 338 kobject_put(part->holder_dir);
338 device_del(part_to_dev(part)); 339 device_del(part_to_dev(part));
339 340
@@ -384,9 +385,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
384 385
385 dname = dev_name(ddev); 386 dname = dev_name(ddev);
386 if (isdigit(dname[strlen(dname) - 1])) 387 if (isdigit(dname[strlen(dname) - 1]))
387 snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); 388 dev_set_name(pdev, "%sp%d", dname, partno);
388 else 389 else
389 snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); 390 dev_set_name(pdev, "%s%d", dname, partno);
390 391
391 device_initialize(pdev); 392 device_initialize(pdev);
392 pdev->class = &block_class; 393 pdev->class = &block_class;
@@ -447,16 +448,11 @@ void register_disk(struct gendisk *disk)
447 struct block_device *bdev; 448 struct block_device *bdev;
448 struct disk_part_iter piter; 449 struct disk_part_iter piter;
449 struct hd_struct *part; 450 struct hd_struct *part;
450 char *s;
451 int err; 451 int err;
452 452
453 ddev->parent = disk->driverfs_dev; 453 ddev->parent = disk->driverfs_dev;
454 454
455 strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); 455 dev_set_name(ddev, disk->disk_name);
456 /* ewww... some of these buggers have / in the name... */
457 s = strchr(ddev->bus_id, '/');
458 if (s)
459 *s = '!';
460 456
461 /* delay uevents, until we scanned partition table */ 457 /* delay uevents, until we scanned partition table */
462 ddev->uevent_suppress = 1; 458 ddev->uevent_suppress = 1;
diff --git a/fs/pipe.c b/fs/pipe.c
index aaf797bd57b9..3a48ba5179d5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags)
1016 goto err_fdr; 1016 goto err_fdr;
1017 fdw = error; 1017 fdw = error;
1018 1018
1019 error = audit_fd_pair(fdr, fdw); 1019 audit_fd_pair(fdr, fdw);
1020 if (error < 0)
1021 goto err_fdw;
1022
1023 fd_install(fdr, fr); 1020 fd_install(fdr, fr);
1024 fd_install(fdw, fw); 1021 fd_install(fdw, fw);
1025 fd[0] = fdr; 1022 fd[0] = fdr;
@@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
1027 1024
1028 return 0; 1025 return 0;
1029 1026
1030 err_fdw:
1031 put_unused_fd(fdw);
1032 err_fdr: 1027 err_fdr:
1033 put_unused_fd(fdr); 1028 put_unused_fd(fdr);
1034 err_read_pipe: 1029 err_read_pipe:
@@ -1048,7 +1043,7 @@ int do_pipe(int *fd)
1048 * sys_pipe() is the normal C calling standard for creating 1043 * sys_pipe() is the normal C calling standard for creating
1049 * a pipe. It's not the way Unix traditionally does this, though. 1044 * a pipe. It's not the way Unix traditionally does this, though.
1050 */ 1045 */
1051asmlinkage long __weak sys_pipe2(int __user *fildes, int flags) 1046SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
1052{ 1047{
1053 int fd[2]; 1048 int fd[2];
1054 int error; 1049 int error;
@@ -1064,7 +1059,7 @@ asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
1064 return error; 1059 return error;
1065} 1060}
1066 1061
1067asmlinkage long __weak sys_pipe(int __user *fildes) 1062SYSCALL_DEFINE1(pipe, int __user *, fildes)
1068{ 1063{
1069 return sys_pipe2(fildes, 0); 1064 return sys_pipe2(fildes, 0);
1070} 1065}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b3..0c9de19a1633 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
65#include <linux/mm.h> 65#include <linux/mm.h>
66#include <linux/rcupdate.h> 66#include <linux/rcupdate.h>
67#include <linux/kallsyms.h> 67#include <linux/kallsyms.h>
68#include <linux/stacktrace.h>
68#include <linux/resource.h> 69#include <linux/resource.h>
69#include <linux/module.h> 70#include <linux/module.h>
70#include <linux/mount.h> 71#include <linux/mount.h>
@@ -109,25 +110,22 @@ struct pid_entry {
109 .op = OP, \ 110 .op = OP, \
110} 111}
111 112
112#define DIR(NAME, MODE, OTYPE) \ 113#define DIR(NAME, MODE, iops, fops) \
113 NOD(NAME, (S_IFDIR|(MODE)), \ 114 NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
114 &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ 115#define LNK(NAME, get_link) \
115 {} )
116#define LNK(NAME, OTYPE) \
117 NOD(NAME, (S_IFLNK|S_IRWXUGO), \ 116 NOD(NAME, (S_IFLNK|S_IRWXUGO), \
118 &proc_pid_link_inode_operations, NULL, \ 117 &proc_pid_link_inode_operations, NULL, \
119 { .proc_get_link = &proc_##OTYPE##_link } ) 118 { .proc_get_link = get_link } )
120#define REG(NAME, MODE, OTYPE) \ 119#define REG(NAME, MODE, fops) \
121 NOD(NAME, (S_IFREG|(MODE)), NULL, \ 120 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
122 &proc_##OTYPE##_operations, {}) 121#define INF(NAME, MODE, read) \
123#define INF(NAME, MODE, OTYPE) \
124 NOD(NAME, (S_IFREG|(MODE)), \ 122 NOD(NAME, (S_IFREG|(MODE)), \
125 NULL, &proc_info_file_operations, \ 123 NULL, &proc_info_file_operations, \
126 { .proc_read = &proc_##OTYPE } ) 124 { .proc_read = read } )
127#define ONE(NAME, MODE, OTYPE) \ 125#define ONE(NAME, MODE, show) \
128 NOD(NAME, (S_IFREG|(MODE)), \ 126 NOD(NAME, (S_IFREG|(MODE)), \
129 NULL, &proc_single_file_operations, \ 127 NULL, &proc_single_file_operations, \
130 { .proc_show = &proc_##OTYPE } ) 128 { .proc_show = show } )
131 129
132/* 130/*
133 * Count the number of hardlinks for the pid_entry table, excluding the . 131 * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
308 struct mm_struct *mm = get_task_mm(task); 306 struct mm_struct *mm = get_task_mm(task);
309 if (mm) { 307 if (mm) {
310 unsigned int nwords = 0; 308 unsigned int nwords = 0;
311 do 309 do {
312 nwords += 2; 310 nwords += 2;
313 while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ 311 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
314 res = nwords * sizeof(mm->saved_auxv[0]); 312 res = nwords * sizeof(mm->saved_auxv[0]);
315 if (res > PAGE_SIZE) 313 if (res > PAGE_SIZE)
316 res = PAGE_SIZE; 314 res = PAGE_SIZE;
@@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
340} 338}
341#endif /* CONFIG_KALLSYMS */ 339#endif /* CONFIG_KALLSYMS */
342 340
341#ifdef CONFIG_STACKTRACE
342
343#define MAX_STACK_TRACE_DEPTH 64
344
345static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
346 struct pid *pid, struct task_struct *task)
347{
348 struct stack_trace trace;
349 unsigned long *entries;
350 int i;
351
352 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
353 if (!entries)
354 return -ENOMEM;
355
356 trace.nr_entries = 0;
357 trace.max_entries = MAX_STACK_TRACE_DEPTH;
358 trace.entries = entries;
359 trace.skip = 0;
360 save_stack_trace_tsk(task, &trace);
361
362 for (i = 0; i < trace.nr_entries; i++) {
363 seq_printf(m, "[<%p>] %pS\n",
364 (void *)entries[i], (void *)entries[i]);
365 }
366 kfree(entries);
367
368 return 0;
369}
370#endif
371
343#ifdef CONFIG_SCHEDSTATS 372#ifdef CONFIG_SCHEDSTATS
344/* 373/*
345 * Provides /proc/PID/schedstat 374 * Provides /proc/PID/schedstat
@@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v)
1186 struct inode *inode = m->private; 1215 struct inode *inode = m->private;
1187 struct task_struct *p; 1216 struct task_struct *p;
1188 1217
1189 WARN_ON(!inode);
1190
1191 p = get_proc_task(inode); 1218 p = get_proc_task(inode);
1192 if (!p) 1219 if (!p)
1193 return -ESRCH; 1220 return -ESRCH;
@@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf,
1205 struct inode *inode = file->f_path.dentry->d_inode; 1232 struct inode *inode = file->f_path.dentry->d_inode;
1206 struct task_struct *p; 1233 struct task_struct *p;
1207 1234
1208 WARN_ON(!inode);
1209
1210 p = get_proc_task(inode); 1235 p = get_proc_task(inode);
1211 if (!p) 1236 if (!p)
1212 return -ESRCH; 1237 return -ESRCH;
@@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1426 if (!ei->pid) 1451 if (!ei->pid)
1427 goto out_unlock; 1452 goto out_unlock;
1428 1453
1429 inode->i_uid = 0;
1430 inode->i_gid = 0;
1431 if (task_dumpable(task)) { 1454 if (task_dumpable(task)) {
1432 rcu_read_lock(); 1455 rcu_read_lock();
1433 cred = __task_cred(task); 1456 cred = __task_cred(task);
@@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1976 const struct pid_entry *ents, 1999 const struct pid_entry *ents,
1977 unsigned int nents) 2000 unsigned int nents)
1978{ 2001{
1979 struct inode *inode;
1980 struct dentry *error; 2002 struct dentry *error;
1981 struct task_struct *task = get_proc_task(dir); 2003 struct task_struct *task = get_proc_task(dir);
1982 const struct pid_entry *p, *last; 2004 const struct pid_entry *p, *last;
1983 2005
1984 error = ERR_PTR(-ENOENT); 2006 error = ERR_PTR(-ENOENT);
1985 inode = NULL;
1986 2007
1987 if (!task) 2008 if (!task)
1988 goto out_no_task; 2009 goto out_no_task;
@@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = {
2138}; 2159};
2139 2160
2140static const struct pid_entry attr_dir_stuff[] = { 2161static const struct pid_entry attr_dir_stuff[] = {
2141 REG("current", S_IRUGO|S_IWUGO, pid_attr), 2162 REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2142 REG("prev", S_IRUGO, pid_attr), 2163 REG("prev", S_IRUGO, proc_pid_attr_operations),
2143 REG("exec", S_IRUGO|S_IWUGO, pid_attr), 2164 REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2144 REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), 2165 REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2145 REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), 2166 REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2146 REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), 2167 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2147}; 2168};
2148 2169
2149static int proc_attr_dir_readdir(struct file * filp, 2170static int proc_attr_dir_readdir(struct file * filp,
@@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2349 if (!ei->pid) 2370 if (!ei->pid)
2350 goto out_iput; 2371 goto out_iput;
2351 2372
2352 inode->i_uid = 0;
2353 inode->i_gid = 0;
2354 inode->i_mode = p->mode; 2373 inode->i_mode = p->mode;
2355 if (S_ISDIR(inode->i_mode)) 2374 if (S_ISDIR(inode->i_mode))
2356 inode->i_nlink = 2; 2375 inode->i_nlink = 2;
@@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations;
2465static const struct inode_operations proc_task_inode_operations; 2484static const struct inode_operations proc_task_inode_operations;
2466 2485
2467static const struct pid_entry tgid_base_stuff[] = { 2486static const struct pid_entry tgid_base_stuff[] = {
2468 DIR("task", S_IRUGO|S_IXUGO, task), 2487 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2469 DIR("fd", S_IRUSR|S_IXUSR, fd), 2488 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2470 DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), 2489 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2471#ifdef CONFIG_NET 2490#ifdef CONFIG_NET
2472 DIR("net", S_IRUGO|S_IXUGO, net), 2491 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2473#endif 2492#endif
2474 REG("environ", S_IRUSR, environ), 2493 REG("environ", S_IRUSR, proc_environ_operations),
2475 INF("auxv", S_IRUSR, pid_auxv), 2494 INF("auxv", S_IRUSR, proc_pid_auxv),
2476 ONE("status", S_IRUGO, pid_status), 2495 ONE("status", S_IRUGO, proc_pid_status),
2477 ONE("personality", S_IRUSR, pid_personality), 2496 ONE("personality", S_IRUSR, proc_pid_personality),
2478 INF("limits", S_IRUSR, pid_limits), 2497 INF("limits", S_IRUSR, proc_pid_limits),
2479#ifdef CONFIG_SCHED_DEBUG 2498#ifdef CONFIG_SCHED_DEBUG
2480 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2499 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2481#endif 2500#endif
2482#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2501#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2483 INF("syscall", S_IRUSR, pid_syscall), 2502 INF("syscall", S_IRUSR, proc_pid_syscall),
2484#endif 2503#endif
2485 INF("cmdline", S_IRUGO, pid_cmdline), 2504 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2486 ONE("stat", S_IRUGO, tgid_stat), 2505 ONE("stat", S_IRUGO, proc_tgid_stat),
2487 ONE("statm", S_IRUGO, pid_statm), 2506 ONE("statm", S_IRUGO, proc_pid_statm),
2488 REG("maps", S_IRUGO, maps), 2507 REG("maps", S_IRUGO, proc_maps_operations),
2489#ifdef CONFIG_NUMA 2508#ifdef CONFIG_NUMA
2490 REG("numa_maps", S_IRUGO, numa_maps), 2509 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2491#endif 2510#endif
2492 REG("mem", S_IRUSR|S_IWUSR, mem), 2511 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2493 LNK("cwd", cwd), 2512 LNK("cwd", proc_cwd_link),
2494 LNK("root", root), 2513 LNK("root", proc_root_link),
2495 LNK("exe", exe), 2514 LNK("exe", proc_exe_link),
2496 REG("mounts", S_IRUGO, mounts), 2515 REG("mounts", S_IRUGO, proc_mounts_operations),
2497 REG("mountinfo", S_IRUGO, mountinfo), 2516 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2498 REG("mountstats", S_IRUSR, mountstats), 2517 REG("mountstats", S_IRUSR, proc_mountstats_operations),
2499#ifdef CONFIG_PROC_PAGE_MONITOR 2518#ifdef CONFIG_PROC_PAGE_MONITOR
2500 REG("clear_refs", S_IWUSR, clear_refs), 2519 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2501 REG("smaps", S_IRUGO, smaps), 2520 REG("smaps", S_IRUGO, proc_smaps_operations),
2502 REG("pagemap", S_IRUSR, pagemap), 2521 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2503#endif 2522#endif
2504#ifdef CONFIG_SECURITY 2523#ifdef CONFIG_SECURITY
2505 DIR("attr", S_IRUGO|S_IXUGO, attr_dir), 2524 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2506#endif 2525#endif
2507#ifdef CONFIG_KALLSYMS 2526#ifdef CONFIG_KALLSYMS
2508 INF("wchan", S_IRUGO, pid_wchan), 2527 INF("wchan", S_IRUGO, proc_pid_wchan),
2528#endif
2529#ifdef CONFIG_STACKTRACE
2530 ONE("stack", S_IRUSR, proc_pid_stack),
2509#endif 2531#endif
2510#ifdef CONFIG_SCHEDSTATS 2532#ifdef CONFIG_SCHEDSTATS
2511 INF("schedstat", S_IRUGO, pid_schedstat), 2533 INF("schedstat", S_IRUGO, proc_pid_schedstat),
2512#endif 2534#endif
2513#ifdef CONFIG_LATENCYTOP 2535#ifdef CONFIG_LATENCYTOP
2514 REG("latency", S_IRUGO, lstats), 2536 REG("latency", S_IRUGO, proc_lstats_operations),
2515#endif 2537#endif
2516#ifdef CONFIG_PROC_PID_CPUSET 2538#ifdef CONFIG_PROC_PID_CPUSET
2517 REG("cpuset", S_IRUGO, cpuset), 2539 REG("cpuset", S_IRUGO, proc_cpuset_operations),
2518#endif 2540#endif
2519#ifdef CONFIG_CGROUPS 2541#ifdef CONFIG_CGROUPS
2520 REG("cgroup", S_IRUGO, cgroup), 2542 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2521#endif 2543#endif
2522 INF("oom_score", S_IRUGO, oom_score), 2544 INF("oom_score", S_IRUGO, proc_oom_score),
2523 REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), 2545 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2524#ifdef CONFIG_AUDITSYSCALL 2546#ifdef CONFIG_AUDITSYSCALL
2525 REG("loginuid", S_IWUSR|S_IRUGO, loginuid), 2547 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2526 REG("sessionid", S_IRUGO, sessionid), 2548 REG("sessionid", S_IRUGO, proc_sessionid_operations),
2527#endif 2549#endif
2528#ifdef CONFIG_FAULT_INJECTION 2550#ifdef CONFIG_FAULT_INJECTION
2529 REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), 2551 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2530#endif 2552#endif
2531#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 2553#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2532 REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), 2554 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
2533#endif 2555#endif
2534#ifdef CONFIG_TASK_IO_ACCOUNTING 2556#ifdef CONFIG_TASK_IO_ACCOUNTING
2535 INF("io", S_IRUGO, tgid_io_accounting), 2557 INF("io", S_IRUGO, proc_tgid_io_accounting),
2536#endif 2558#endif
2537}; 2559};
2538 2560
@@ -2805,66 +2827,69 @@ out_no_task:
2805 * Tasks 2827 * Tasks
2806 */ 2828 */
2807static const struct pid_entry tid_base_stuff[] = { 2829static const struct pid_entry tid_base_stuff[] = {
2808 DIR("fd", S_IRUSR|S_IXUSR, fd), 2830 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2809 DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), 2831 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
2810 REG("environ", S_IRUSR, environ), 2832 REG("environ", S_IRUSR, proc_environ_operations),
2811 INF("auxv", S_IRUSR, pid_auxv), 2833 INF("auxv", S_IRUSR, proc_pid_auxv),
2812 ONE("status", S_IRUGO, pid_status), 2834 ONE("status", S_IRUGO, proc_pid_status),
2813 ONE("personality", S_IRUSR, pid_personality), 2835 ONE("personality", S_IRUSR, proc_pid_personality),
2814 INF("limits", S_IRUSR, pid_limits), 2836 INF("limits", S_IRUSR, proc_pid_limits),
2815#ifdef CONFIG_SCHED_DEBUG 2837#ifdef CONFIG_SCHED_DEBUG
2816 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2838 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2817#endif 2839#endif
2818#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2840#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2819 INF("syscall", S_IRUSR, pid_syscall), 2841 INF("syscall", S_IRUSR, proc_pid_syscall),
2820#endif 2842#endif
2821 INF("cmdline", S_IRUGO, pid_cmdline), 2843 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2822 ONE("stat", S_IRUGO, tid_stat), 2844 ONE("stat", S_IRUGO, proc_tid_stat),
2823 ONE("statm", S_IRUGO, pid_statm), 2845 ONE("statm", S_IRUGO, proc_pid_statm),
2824 REG("maps", S_IRUGO, maps), 2846 REG("maps", S_IRUGO, proc_maps_operations),
2825#ifdef CONFIG_NUMA 2847#ifdef CONFIG_NUMA
2826 REG("numa_maps", S_IRUGO, numa_maps), 2848 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2827#endif 2849#endif
2828 REG("mem", S_IRUSR|S_IWUSR, mem), 2850 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2829 LNK("cwd", cwd), 2851 LNK("cwd", proc_cwd_link),
2830 LNK("root", root), 2852 LNK("root", proc_root_link),
2831 LNK("exe", exe), 2853 LNK("exe", proc_exe_link),
2832 REG("mounts", S_IRUGO, mounts), 2854 REG("mounts", S_IRUGO, proc_mounts_operations),
2833 REG("mountinfo", S_IRUGO, mountinfo), 2855 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2834#ifdef CONFIG_PROC_PAGE_MONITOR 2856#ifdef CONFIG_PROC_PAGE_MONITOR
2835 REG("clear_refs", S_IWUSR, clear_refs), 2857 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2836 REG("smaps", S_IRUGO, smaps), 2858 REG("smaps", S_IRUGO, proc_smaps_operations),
2837 REG("pagemap", S_IRUSR, pagemap), 2859 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2838#endif 2860#endif
2839#ifdef CONFIG_SECURITY 2861#ifdef CONFIG_SECURITY
2840 DIR("attr", S_IRUGO|S_IXUGO, attr_dir), 2862 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2841#endif 2863#endif
2842#ifdef CONFIG_KALLSYMS 2864#ifdef CONFIG_KALLSYMS
2843 INF("wchan", S_IRUGO, pid_wchan), 2865 INF("wchan", S_IRUGO, proc_pid_wchan),
2866#endif
2867#ifdef CONFIG_STACKTRACE
2868 ONE("stack", S_IRUSR, proc_pid_stack),
2844#endif 2869#endif
2845#ifdef CONFIG_SCHEDSTATS 2870#ifdef CONFIG_SCHEDSTATS
2846 INF("schedstat", S_IRUGO, pid_schedstat), 2871 INF("schedstat", S_IRUGO, proc_pid_schedstat),
2847#endif 2872#endif
2848#ifdef CONFIG_LATENCYTOP 2873#ifdef CONFIG_LATENCYTOP
2849 REG("latency", S_IRUGO, lstats), 2874 REG("latency", S_IRUGO, proc_lstats_operations),
2850#endif 2875#endif
2851#ifdef CONFIG_PROC_PID_CPUSET 2876#ifdef CONFIG_PROC_PID_CPUSET
2852 REG("cpuset", S_IRUGO, cpuset), 2877 REG("cpuset", S_IRUGO, proc_cpuset_operations),
2853#endif 2878#endif
2854#ifdef CONFIG_CGROUPS 2879#ifdef CONFIG_CGROUPS
2855 REG("cgroup", S_IRUGO, cgroup), 2880 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2856#endif 2881#endif
2857 INF("oom_score", S_IRUGO, oom_score), 2882 INF("oom_score", S_IRUGO, proc_oom_score),
2858 REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), 2883 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2859#ifdef CONFIG_AUDITSYSCALL 2884#ifdef CONFIG_AUDITSYSCALL
2860 REG("loginuid", S_IWUSR|S_IRUGO, loginuid), 2885 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2861 REG("sessionid", S_IRUSR, sessionid), 2886 REG("sessionid", S_IRUSR, proc_sessionid_operations),
2862#endif 2887#endif
2863#ifdef CONFIG_FAULT_INJECTION 2888#ifdef CONFIG_FAULT_INJECTION
2864 REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), 2889 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2865#endif 2890#endif
2866#ifdef CONFIG_TASK_IO_ACCOUNTING 2891#ifdef CONFIG_TASK_IO_ACCOUNTING
2867 INF("io", S_IRUGO, tid_io_accounting), 2892 INF("io", S_IRUGO, proc_tid_io_accounting),
2868#endif 2893#endif
2869}; 2894};
2870 2895
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 60a359b35582..db7fa5cab988 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -14,7 +14,6 @@
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/smp_lock.h>
18#include <linux/init.h> 17#include <linux/init.h>
19#include <linux/idr.h> 18#include <linux/idr.h>
20#include <linux/namei.h> 19#include <linux/namei.h>
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
379 struct inode *inode = NULL; 378 struct inode *inode = NULL;
380 int error = -ENOENT; 379 int error = -ENOENT;
381 380
382 lock_kernel();
383 spin_lock(&proc_subdir_lock); 381 spin_lock(&proc_subdir_lock);
384 for (de = de->subdir; de ; de = de->next) { 382 for (de = de->subdir; de ; de = de->next) {
385 if (de->namelen != dentry->d_name.len) 383 if (de->namelen != dentry->d_name.len)
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
397 } 395 }
398 spin_unlock(&proc_subdir_lock); 396 spin_unlock(&proc_subdir_lock);
399out_unlock: 397out_unlock:
400 unlock_kernel();
401 398
402 if (inode) { 399 if (inode) {
403 dentry->d_op = &proc_dentry_operations; 400 dentry->d_op = &proc_dentry_operations;
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
432 struct inode *inode = filp->f_path.dentry->d_inode; 429 struct inode *inode = filp->f_path.dentry->d_inode;
433 int ret = 0; 430 int ret = 0;
434 431
435 lock_kernel();
436
437 ino = inode->i_ino; 432 ino = inode->i_ino;
438 i = filp->f_pos; 433 i = filp->f_pos;
439 switch (i) { 434 switch (i) {
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
487 spin_unlock(&proc_subdir_lock); 482 spin_unlock(&proc_subdir_lock);
488 } 483 }
489 ret = 1; 484 ret = 1;
490out: unlock_kernel(); 485out:
491 return ret; 486 return ret;
492} 487}
493 488
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
504 * the /proc directory. 499 * the /proc directory.
505 */ 500 */
506static const struct file_operations proc_dir_operations = { 501static const struct file_operations proc_dir_operations = {
502 .llseek = generic_file_llseek,
507 .read = generic_read_dir, 503 .read = generic_read_dir,
508 .readdir = proc_readdir, 504 .readdir = proc_readdir,
509}; 505};
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2543fd00c658..3e76bb9b3ad6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de)
35 */ 35 */
36void de_put(struct proc_dir_entry *de) 36void de_put(struct proc_dir_entry *de)
37{ 37{
38 lock_kernel();
39 if (!atomic_read(&de->count)) { 38 if (!atomic_read(&de->count)) {
40 printk("de_put: entry %s already free!\n", de->name); 39 printk("de_put: entry %s already free!\n", de->name);
41 unlock_kernel();
42 return; 40 return;
43 } 41 }
44 42
45 if (atomic_dec_and_test(&de->count)) 43 if (atomic_dec_and_test(&de->count))
46 free_proc_entry(de); 44 free_proc_entry(de);
47 unlock_kernel();
48} 45}
49 46
50/* 47/*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3e8aeb8b61ce..cd53ff838498 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -41,8 +41,6 @@ do { \
41 (vmi)->used = 0; \ 41 (vmi)->used = 0; \
42 (vmi)->largest_chunk = 0; \ 42 (vmi)->largest_chunk = 0; \
43} while(0) 43} while(0)
44
45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
46#endif 44#endif
47 45
48extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, 46extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b1675c4e66da..43d23948384a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
74 "LowTotal: %8lu kB\n" 74 "LowTotal: %8lu kB\n"
75 "LowFree: %8lu kB\n" 75 "LowFree: %8lu kB\n"
76#endif 76#endif
77#ifndef CONFIG_MMU
78 "MmapCopy: %8lu kB\n"
79#endif
77 "SwapTotal: %8lu kB\n" 80 "SwapTotal: %8lu kB\n"
78 "SwapFree: %8lu kB\n" 81 "SwapFree: %8lu kB\n"
79 "Dirty: %8lu kB\n" 82 "Dirty: %8lu kB\n"
@@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
116 K(i.totalram-i.totalhigh), 119 K(i.totalram-i.totalhigh),
117 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
118#endif 121#endif
122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)),
124#endif
119 K(i.totalswap), 125 K(i.totalswap),
120 K(i.freeswap), 126 K(i.freeswap),
121 K(global_page_state(NR_FILE_DIRTY)), 127 K(global_page_state(NR_FILE_DIRTY)),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3f87d2632947..b446d7ad0b0d 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,33 +33,33 @@
33#include "internal.h" 33#include "internal.h"
34 34
35/* 35/*
36 * display a single VMA to a sequenced file 36 * display a single region to a sequenced file
37 */ 37 */
38int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) 38static int nommu_region_show(struct seq_file *m, struct vm_region *region)
39{ 39{
40 unsigned long ino = 0; 40 unsigned long ino = 0;
41 struct file *file; 41 struct file *file;
42 dev_t dev = 0; 42 dev_t dev = 0;
43 int flags, len; 43 int flags, len;
44 44
45 flags = vma->vm_flags; 45 flags = region->vm_flags;
46 file = vma->vm_file; 46 file = region->vm_file;
47 47
48 if (file) { 48 if (file) {
49 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 49 struct inode *inode = region->vm_file->f_path.dentry->d_inode;
50 dev = inode->i_sb->s_dev; 50 dev = inode->i_sb->s_dev;
51 ino = inode->i_ino; 51 ino = inode->i_ino;
52 } 52 }
53 53
54 seq_printf(m, 54 seq_printf(m,
55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
56 vma->vm_start, 56 region->vm_start,
57 vma->vm_end, 57 region->vm_end,
58 flags & VM_READ ? 'r' : '-', 58 flags & VM_READ ? 'r' : '-',
59 flags & VM_WRITE ? 'w' : '-', 59 flags & VM_WRITE ? 'w' : '-',
60 flags & VM_EXEC ? 'x' : '-', 60 flags & VM_EXEC ? 'x' : '-',
61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
62 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, 62 ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
63 MAJOR(dev), MINOR(dev), ino, &len); 63 MAJOR(dev), MINOR(dev), ino, &len);
64 64
65 if (file) { 65 if (file) {
@@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
75} 75}
76 76
77/* 77/*
78 * display a list of all the VMAs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
79 * - nommu kernals have a single flat list 79 * - nommu kernals have a single flat list
80 */ 80 */
81static int nommu_vma_list_show(struct seq_file *m, void *v) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
83 struct vm_area_struct *vma; 83 struct rb_node *p = _p;
84 84
85 vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); 85 return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
86 return nommu_vma_show(m, vma);
87} 86}
88 87
89static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) 88static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
90{ 89{
91 struct rb_node *_rb; 90 struct rb_node *p;
92 loff_t pos = *_pos; 91 loff_t pos = *_pos;
93 void *next = NULL;
94 92
95 down_read(&nommu_vma_sem); 93 down_read(&nommu_region_sem);
96 94
97 for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { 95 for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
98 if (pos == 0) { 96 if (pos-- == 0)
99 next = _rb; 97 return p;
100 break; 98 return NULL;
101 }
102 pos--;
103 }
104
105 return next;
106} 99}
107 100
108static void nommu_vma_list_stop(struct seq_file *m, void *v) 101static void nommu_region_list_stop(struct seq_file *m, void *v)
109{ 102{
110 up_read(&nommu_vma_sem); 103 up_read(&nommu_region_sem);
111} 104}
112 105
113static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) 106static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
114{ 107{
115 (*pos)++; 108 (*pos)++;
116 return rb_next((struct rb_node *) v); 109 return rb_next((struct rb_node *) v);
117} 110}
118 111
119static const struct seq_operations proc_nommu_vma_list_seqop = { 112static struct seq_operations proc_nommu_region_list_seqop = {
120 .start = nommu_vma_list_start, 113 .start = nommu_region_list_start,
121 .next = nommu_vma_list_next, 114 .next = nommu_region_list_next,
122 .stop = nommu_vma_list_stop, 115 .stop = nommu_region_list_stop,
123 .show = nommu_vma_list_show 116 .show = nommu_region_list_show
124}; 117};
125 118
126static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) 119static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
127{ 120{
128 return seq_open(file, &proc_nommu_vma_list_seqop); 121 return seq_open(file, &proc_nommu_region_list_seqop);
129} 122}
130 123
131static const struct file_operations proc_nommu_vma_list_operations = { 124static const struct file_operations proc_nommu_region_list_operations = {
132 .open = proc_nommu_vma_list_open, 125 .open = proc_nommu_region_list_open,
133 .read = seq_read, 126 .read = seq_read,
134 .llseek = seq_lseek, 127 .llseek = seq_lseek,
135 .release = seq_release, 128 .release = seq_release,
@@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = {
137 130
138static int __init proc_nommu_init(void) 131static int __init proc_nommu_init(void)
139{ 132{
140 proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); 133 proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
141 return 0; 134 return 0;
142} 135}
143 136
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7bc296f424ae..04d1270f1c38 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -18,7 +18,6 @@
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/smp_lock.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
23#include <linux/nsproxy.h> 22#include <linux/nsproxy.h>
24#include <net/net_namespace.h> 23#include <net/net_namespace.h>
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
172} 171}
173 172
174const struct file_operations proc_net_operations = { 173const struct file_operations proc_net_operations = {
174 .llseek = generic_file_llseek,
175 .read = generic_read_dir, 175 .read = generic_read_dir,
176 .readdir = proc_tgid_net_readdir, 176 .readdir = proc_tgid_net_readdir,
177}; 177};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 06ed10b7da9e..94fcfff6863a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
31 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 31 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
32 inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ 32 inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
33 inode->i_mode = table->mode; 33 inode->i_mode = table->mode;
34 inode->i_uid = inode->i_gid = 0;
35 if (!table->child) { 34 if (!table->child) {
36 inode->i_mode |= S_IFREG; 35 inode->i_mode |= S_IFREG;
37 inode->i_op = &proc_sys_inode_operations; 36 inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7761602af9de..f6299a25594e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,7 +16,6 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/smp_lock.h>
20#include <linux/mount.h> 19#include <linux/mount.h>
21#include <linux/pid_namespace.h> 20#include <linux/pid_namespace.h>
22 21
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp,
162 unsigned int nr = filp->f_pos; 161 unsigned int nr = filp->f_pos;
163 int ret; 162 int ret;
164 163
165 lock_kernel();
166
167 if (nr < FIRST_PROCESS_ENTRY) { 164 if (nr < FIRST_PROCESS_ENTRY) {
168 int error = proc_readdir(filp, dirent, filldir); 165 int error = proc_readdir(filp, dirent, filldir);
169 if (error <= 0) { 166 if (error <= 0)
170 unlock_kernel();
171 return error; 167 return error;
172 }
173 filp->f_pos = FIRST_PROCESS_ENTRY; 168 filp->f_pos = FIRST_PROCESS_ENTRY;
174 } 169 }
175 unlock_kernel();
176 170
177 ret = proc_pid_readdir(filp, dirent, filldir); 171 ret = proc_pid_readdir(filp, dirent, filldir);
178 return ret; 172 return ret;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 3bb1cf1e7425..f75efa22df5e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,6 +9,7 @@
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/irqnr.h>
12#include <asm/cputime.h> 13#include <asm/cputime.h>
13 14
14#ifndef arch_irq_stat_cpu 15#ifndef arch_irq_stat_cpu
@@ -45,10 +46,6 @@ static int show_stat(struct seq_file *p, void *v)
45 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 46 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
46 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 47 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
47 for_each_irq_nr(j) { 48 for_each_irq_nr(j) {
48#ifdef CONFIG_SPARSE_IRQ
49 if (!irq_to_desc(j))
50 continue;
51#endif
52 sum += kstat_irqs_cpu(j, i); 49 sum += kstat_irqs_cpu(j, i);
53 } 50 }
54 sum += arch_irq_stat_cpu(i); 51 sum += arch_irq_stat_cpu(i);
@@ -95,12 +92,6 @@ static int show_stat(struct seq_file *p, void *v)
95 /* sum again ? it could be updated? */ 92 /* sum again ? it could be updated? */
96 for_each_irq_nr(j) { 93 for_each_irq_nr(j) {
97 per_irq_sum = 0; 94 per_irq_sum = 0;
98#ifdef CONFIG_SPARSE_IRQ
99 if (!irq_to_desc(j)) {
100 seq_printf(p, " %u", per_irq_sum);
101 continue;
102 }
103#endif
104 for_each_possible_cpu(i) 95 for_each_possible_cpu(i)
105 per_irq_sum += kstat_irqs_cpu(j, i); 96 per_irq_sum += kstat_irqs_cpu(j, i);
106 97
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3a8bdd7f5756..94063840832a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v)
396 "Private_Clean: %8lu kB\n" 396 "Private_Clean: %8lu kB\n"
397 "Private_Dirty: %8lu kB\n" 397 "Private_Dirty: %8lu kB\n"
398 "Referenced: %8lu kB\n" 398 "Referenced: %8lu kB\n"
399 "Swap: %8lu kB\n", 399 "Swap: %8lu kB\n"
400 "KernelPageSize: %8lu kB\n"
401 "MMUPageSize: %8lu kB\n",
400 (vma->vm_end - vma->vm_start) >> 10, 402 (vma->vm_end - vma->vm_start) >> 10,
401 mss.resident >> 10, 403 mss.resident >> 10,
402 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 404 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v)
405 mss.private_clean >> 10, 407 mss.private_clean >> 10,
406 mss.private_dirty >> 10, 408 mss.private_dirty >> 10,
407 mss.referenced >> 10, 409 mss.referenced >> 10,
408 mss.swap >> 10); 410 mss.swap >> 10,
411 vma_kernel_pagesize(vma) >> 10,
412 vma_mmu_pagesize(vma) >> 10);
409 413
410 if (m->count < m->size) /* vma is copied successfully */ 414 if (m->count < m->size) /* vma is copied successfully */
411 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 415 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 219bd79ea894..343ea1216bc8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -9,31 +9,38 @@
9 9
10/* 10/*
11 * Logic: we've got two memory sums for each process, "shared", and 11 * Logic: we've got two memory sums for each process, "shared", and
12 * "non-shared". Shared memory may get counted more then once, for 12 * "non-shared". Shared memory may get counted more than once, for
13 * each process that owns it. Non-shared memory is counted 13 * each process that owns it. Non-shared memory is counted
14 * accurately. 14 * accurately.
15 */ 15 */
16void task_mem(struct seq_file *m, struct mm_struct *mm) 16void task_mem(struct seq_file *m, struct mm_struct *mm)
17{ 17{
18 struct vm_list_struct *vml; 18 struct vm_area_struct *vma;
19 unsigned long bytes = 0, sbytes = 0, slack = 0; 19 struct vm_region *region;
20 struct rb_node *p;
21 unsigned long bytes = 0, sbytes = 0, slack = 0, size;
20 22
21 down_read(&mm->mmap_sem); 23 down_read(&mm->mmap_sem);
22 for (vml = mm->context.vmlist; vml; vml = vml->next) { 24 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
23 if (!vml->vma) 25 vma = rb_entry(p, struct vm_area_struct, vm_rb);
24 continue; 26
27 bytes += kobjsize(vma);
28
29 region = vma->vm_region;
30 if (region) {
31 size = kobjsize(region);
32 size += region->vm_end - region->vm_start;
33 } else {
34 size = vma->vm_end - vma->vm_start;
35 }
25 36
26 bytes += kobjsize(vml);
27 if (atomic_read(&mm->mm_count) > 1 || 37 if (atomic_read(&mm->mm_count) > 1 ||
28 atomic_read(&vml->vma->vm_usage) > 1 38 vma->vm_flags & VM_MAYSHARE) {
29 ) { 39 sbytes += size;
30 sbytes += kobjsize((void *) vml->vma->vm_start);
31 sbytes += kobjsize(vml->vma);
32 } else { 40 } else {
33 bytes += kobjsize((void *) vml->vma->vm_start); 41 bytes += size;
34 bytes += kobjsize(vml->vma); 42 if (region)
35 slack += kobjsize((void *) vml->vma->vm_start) - 43 slack = region->vm_end - vma->vm_end;
36 (vml->vma->vm_end - vml->vma->vm_start);
37 } 44 }
38 } 45 }
39 46
@@ -70,13 +77,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
70 77
71unsigned long task_vsize(struct mm_struct *mm) 78unsigned long task_vsize(struct mm_struct *mm)
72{ 79{
73 struct vm_list_struct *tbp; 80 struct vm_area_struct *vma;
81 struct rb_node *p;
74 unsigned long vsize = 0; 82 unsigned long vsize = 0;
75 83
76 down_read(&mm->mmap_sem); 84 down_read(&mm->mmap_sem);
77 for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { 85 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
78 if (tbp->vma) 86 vma = rb_entry(p, struct vm_area_struct, vm_rb);
79 vsize += kobjsize((void *) tbp->vma->vm_start); 87 vsize += vma->vm_end - vma->vm_start;
80 } 88 }
81 up_read(&mm->mmap_sem); 89 up_read(&mm->mmap_sem);
82 return vsize; 90 return vsize;
@@ -85,15 +93,19 @@ unsigned long task_vsize(struct mm_struct *mm)
85int task_statm(struct mm_struct *mm, int *shared, int *text, 93int task_statm(struct mm_struct *mm, int *shared, int *text,
86 int *data, int *resident) 94 int *data, int *resident)
87{ 95{
88 struct vm_list_struct *tbp; 96 struct vm_area_struct *vma;
97 struct vm_region *region;
98 struct rb_node *p;
89 int size = kobjsize(mm); 99 int size = kobjsize(mm);
90 100
91 down_read(&mm->mmap_sem); 101 down_read(&mm->mmap_sem);
92 for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { 102 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
93 size += kobjsize(tbp); 103 vma = rb_entry(p, struct vm_area_struct, vm_rb);
94 if (tbp->vma) { 104 size += kobjsize(vma);
95 size += kobjsize(tbp->vma); 105 region = vma->vm_region;
96 size += kobjsize((void *) tbp->vma->vm_start); 106 if (region) {
107 size += kobjsize(region);
108 size += region->vm_end - region->vm_start;
97 } 109 }
98 } 110 }
99 111
@@ -105,20 +117,62 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
105} 117}
106 118
107/* 119/*
120 * display a single VMA to a sequenced file
121 */
122static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
123{
124 unsigned long ino = 0;
125 struct file *file;
126 dev_t dev = 0;
127 int flags, len;
128
129 flags = vma->vm_flags;
130 file = vma->vm_file;
131
132 if (file) {
133 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
134 dev = inode->i_sb->s_dev;
135 ino = inode->i_ino;
136 }
137
138 seq_printf(m,
139 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
140 vma->vm_start,
141 vma->vm_end,
142 flags & VM_READ ? 'r' : '-',
143 flags & VM_WRITE ? 'w' : '-',
144 flags & VM_EXEC ? 'x' : '-',
145 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
146 vma->vm_pgoff << PAGE_SHIFT,
147 MAJOR(dev), MINOR(dev), ino, &len);
148
149 if (file) {
150 len = 25 + sizeof(void *) * 6 - len;
151 if (len < 1)
152 len = 1;
153 seq_printf(m, "%*c", len, ' ');
154 seq_path(m, &file->f_path, "");
155 }
156
157 seq_putc(m, '\n');
158 return 0;
159}
160
161/*
108 * display mapping lines for a particular process's /proc/pid/maps 162 * display mapping lines for a particular process's /proc/pid/maps
109 */ 163 */
110static int show_map(struct seq_file *m, void *_vml) 164static int show_map(struct seq_file *m, void *_p)
111{ 165{
112 struct vm_list_struct *vml = _vml; 166 struct rb_node *p = _p;
113 167
114 return nommu_vma_show(m, vml->vma); 168 return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
115} 169}
116 170
117static void *m_start(struct seq_file *m, loff_t *pos) 171static void *m_start(struct seq_file *m, loff_t *pos)
118{ 172{
119 struct proc_maps_private *priv = m->private; 173 struct proc_maps_private *priv = m->private;
120 struct vm_list_struct *vml;
121 struct mm_struct *mm; 174 struct mm_struct *mm;
175 struct rb_node *p;
122 loff_t n = *pos; 176 loff_t n = *pos;
123 177
124 /* pin the task and mm whilst we play with them */ 178 /* pin the task and mm whilst we play with them */
@@ -134,9 +188,9 @@ static void *m_start(struct seq_file *m, loff_t *pos)
134 } 188 }
135 189
136 /* start from the Nth VMA */ 190 /* start from the Nth VMA */
137 for (vml = mm->context.vmlist; vml; vml = vml->next) 191 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
138 if (n-- == 0) 192 if (n-- == 0)
139 return vml; 193 return p;
140 return NULL; 194 return NULL;
141} 195}
142 196
@@ -152,12 +206,12 @@ static void m_stop(struct seq_file *m, void *_vml)
152 } 206 }
153} 207}
154 208
155static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) 209static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
156{ 210{
157 struct vm_list_struct *vml = _vml; 211 struct rb_node *p = _p;
158 212
159 (*pos)++; 213 (*pos)++;
160 return vml ? vml->next : NULL; 214 return p ? rb_next(p) : NULL;
161} 215}
162 216
163static const struct seq_operations proc_pid_maps_ops = { 217static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec59504906..5edcc3f92ba7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
47 47
48 offset = (unsigned long)(*ppos % PAGE_SIZE); 48 offset = (unsigned long)(*ppos % PAGE_SIZE);
49 pfn = (unsigned long)(*ppos / PAGE_SIZE); 49 pfn = (unsigned long)(*ppos / PAGE_SIZE);
50 if (pfn > saved_max_pfn)
51 return -EINVAL;
52 50
53 do { 51 do {
54 if (count > (PAGE_SIZE - offset)) 52 if (count > (PAGE_SIZE - offset))
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
new file mode 100644
index 000000000000..be8e0e1445b6
--- /dev/null
+++ b/fs/qnx4/Kconfig
@@ -0,0 +1,25 @@
1config QNX4FS_FS
2 tristate "QNX4 file system support (read only)"
3 depends on BLOCK
4 help
5 This is the file system used by the real-time operating systems
6 QNX 4 and QNX 6 (the latter is also called QNX RTP).
7 Further information is available at <http://www.qnx.com/>.
8 Say Y if you intend to mount QNX hard disks or floppies.
9 Unless you say Y to "QNX4FS write support" below, you will
10 only be able to read these file systems.
11
12 To compile this file system support as a module, choose M here: the
13 module will be called qnx4.
14
15 If you don't know whether you need it, then you don't need it:
16 answer N.
17
18config QNX4FS_RW
19 bool "QNX4FS write support (DANGEROUS)"
20 depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
21 help
22 Say Y if you want to test write support for QNX4 file systems.
23
24 It's currently broken, so for now:
25 answer N.
diff --git a/fs/quota.c b/fs/quota.c
index b7fe44e01618..d76ada914f98 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
73 case Q_SETQUOTA: 73 case Q_SETQUOTA:
74 case Q_GETQUOTA: 74 case Q_GETQUOTA:
75 /* This is just informative test so we are satisfied without a lock */ 75 /* This is just informative test so we are satisfied without a lock */
76 if (!sb_has_quota_enabled(sb, type)) 76 if (!sb_has_quota_active(sb, type))
77 return -ESRCH; 77 return -ESRCH;
78 } 78 }
79 79
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
160 int cnt; 160 int cnt;
161 161
162 sb->s_qcop->quota_sync(sb, type); 162 sb->s_qcop->quota_sync(sb, type);
163
164 if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
165 return;
163 /* This is not very clever (and fast) but currently I don't know about 166 /* This is not very clever (and fast) but currently I don't know about
164 * any other simple way of getting quota data to disk and we must get 167 * any other simple way of getting quota data to disk and we must get
165 * them there for userspace to be visible... */ 168 * them there for userspace to be visible... */
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
175 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 178 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
176 if (type != -1 && cnt != type) 179 if (type != -1 && cnt != type)
177 continue; 180 continue;
178 if (!sb_has_quota_enabled(sb, cnt)) 181 if (!sb_has_quota_active(sb, cnt))
179 continue; 182 continue;
180 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); 183 mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
181 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); 184 truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +204,7 @@ restart:
201 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 204 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
202 if (type != -1 && type != cnt) 205 if (type != -1 && type != cnt)
203 continue; 206 continue;
204 if (!sb_has_quota_enabled(sb, cnt)) 207 if (!sb_has_quota_active(sb, cnt))
205 continue; 208 continue;
206 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && 209 if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
207 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) 210 list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
245 __u32 fmt; 248 __u32 fmt;
246 249
247 down_read(&sb_dqopt(sb)->dqptr_sem); 250 down_read(&sb_dqopt(sb)->dqptr_sem);
248 if (!sb_has_quota_enabled(sb, type)) { 251 if (!sb_has_quota_active(sb, type)) {
249 up_read(&sb_dqopt(sb)->dqptr_sem); 252 up_read(&sb_dqopt(sb)->dqptr_sem);
250 return -ESRCH; 253 return -ESRCH;
251 } 254 }
@@ -368,7 +371,8 @@ static inline struct super_block *quotactl_block(const char __user *special)
368 * calls. Maybe we need to add the process quotas etc. in the future, 371 * calls. Maybe we need to add the process quotas etc. in the future,
369 * but we probably should use rlimits for that. 372 * but we probably should use rlimits for that.
370 */ 373 */
371asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr) 374SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
375 qid_t, id, void __user *, addr)
372{ 376{
373 uint cmds, type; 377 uint cmds, type;
374 struct super_block *sb = NULL; 378 struct super_block *sb = NULL;
diff --git a/fs/quota_tree.c b/fs/quota_tree.c
new file mode 100644
index 000000000000..953404c95b17
--- /dev/null
+++ b/fs/quota_tree.c
@@ -0,0 +1,645 @@
1/*
2 * vfsv0 quota IO operations on file
3 */
4
5#include <linux/errno.h>
6#include <linux/fs.h>
7#include <linux/mount.h>
8#include <linux/dqblk_v2.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/quotaops.h>
14
15#include <asm/byteorder.h>
16
17#include "quota_tree.h"
18
19MODULE_AUTHOR("Jan Kara");
20MODULE_DESCRIPTION("Quota trie support");
21MODULE_LICENSE("GPL");
22
23#define __QUOTA_QT_PARANOIA
24
25typedef char *dqbuf_t;
26
27static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
28{
29 unsigned int epb = info->dqi_usable_bs >> 2;
30
31 depth = info->dqi_qtree_depth - depth - 1;
32 while (depth--)
33 id /= epb;
34 return id % epb;
35}
36
37/* Number of entries in one blocks */
38static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
39{
40 return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
41 / info->dqi_entry_size;
42}
43
44static dqbuf_t getdqbuf(size_t size)
45{
46 dqbuf_t buf = kmalloc(size, GFP_NOFS);
47 if (!buf)
48 printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
49 return buf;
50}
51
52static inline void freedqbuf(dqbuf_t buf)
53{
54 kfree(buf);
55}
56
57static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
58{
59 struct super_block *sb = info->dqi_sb;
60
61 memset(buf, 0, info->dqi_usable_bs);
62 return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
63 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
64}
65
66static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
67{
68 struct super_block *sb = info->dqi_sb;
69
70 return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
71 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
72}
73
74/* Remove empty block from list and return it */
75static int get_free_dqblk(struct qtree_mem_dqinfo *info)
76{
77 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
78 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
79 int ret, blk;
80
81 if (!buf)
82 return -ENOMEM;
83 if (info->dqi_free_blk) {
84 blk = info->dqi_free_blk;
85 ret = read_blk(info, blk, buf);
86 if (ret < 0)
87 goto out_buf;
88 info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
89 }
90 else {
91 memset(buf, 0, info->dqi_usable_bs);
92 /* Assure block allocation... */
93 ret = write_blk(info, info->dqi_blocks, buf);
94 if (ret < 0)
95 goto out_buf;
96 blk = info->dqi_blocks++;
97 }
98 mark_info_dirty(info->dqi_sb, info->dqi_type);
99 ret = blk;
100out_buf:
101 freedqbuf(buf);
102 return ret;
103}
104
105/* Insert empty block to the list */
106static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
107{
108 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
109 int err;
110
111 dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
112 dh->dqdh_prev_free = cpu_to_le32(0);
113 dh->dqdh_entries = cpu_to_le16(0);
114 err = write_blk(info, blk, buf);
115 if (err < 0)
116 return err;
117 info->dqi_free_blk = blk;
118 mark_info_dirty(info->dqi_sb, info->dqi_type);
119 return 0;
120}
121
122/* Remove given block from the list of blocks with free entries */
123static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
124{
125 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
126 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
127 uint nextblk = le32_to_cpu(dh->dqdh_next_free);
128 uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
129 int err;
130
131 if (!tmpbuf)
132 return -ENOMEM;
133 if (nextblk) {
134 err = read_blk(info, nextblk, tmpbuf);
135 if (err < 0)
136 goto out_buf;
137 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
138 dh->dqdh_prev_free;
139 err = write_blk(info, nextblk, tmpbuf);
140 if (err < 0)
141 goto out_buf;
142 }
143 if (prevblk) {
144 err = read_blk(info, prevblk, tmpbuf);
145 if (err < 0)
146 goto out_buf;
147 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
148 dh->dqdh_next_free;
149 err = write_blk(info, prevblk, tmpbuf);
150 if (err < 0)
151 goto out_buf;
152 } else {
153 info->dqi_free_entry = nextblk;
154 mark_info_dirty(info->dqi_sb, info->dqi_type);
155 }
156 freedqbuf(tmpbuf);
157 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
158 /* No matter whether write succeeds block is out of list */
159 if (write_blk(info, blk, buf) < 0)
160 printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
161 return 0;
162out_buf:
163 freedqbuf(tmpbuf);
164 return err;
165}
166
167/* Insert given block to the beginning of list with free entries */
168static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
169{
170 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
171 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
172 int err;
173
174 if (!tmpbuf)
175 return -ENOMEM;
176 dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
177 dh->dqdh_prev_free = cpu_to_le32(0);
178 err = write_blk(info, blk, buf);
179 if (err < 0)
180 goto out_buf;
181 if (info->dqi_free_entry) {
182 err = read_blk(info, info->dqi_free_entry, tmpbuf);
183 if (err < 0)
184 goto out_buf;
185 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
186 cpu_to_le32(blk);
187 err = write_blk(info, info->dqi_free_entry, tmpbuf);
188 if (err < 0)
189 goto out_buf;
190 }
191 freedqbuf(tmpbuf);
192 info->dqi_free_entry = blk;
193 mark_info_dirty(info->dqi_sb, info->dqi_type);
194 return 0;
195out_buf:
196 freedqbuf(tmpbuf);
197 return err;
198}
199
200/* Is the entry in the block free? */
201int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
202{
203 int i;
204
205 for (i = 0; i < info->dqi_entry_size; i++)
206 if (disk[i])
207 return 0;
208 return 1;
209}
210EXPORT_SYMBOL(qtree_entry_unused);
211
212/* Find space for dquot */
213static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
214 struct dquot *dquot, int *err)
215{
216 uint blk, i;
217 struct qt_disk_dqdbheader *dh;
218 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
219 char *ddquot;
220
221 *err = 0;
222 if (!buf) {
223 *err = -ENOMEM;
224 return 0;
225 }
226 dh = (struct qt_disk_dqdbheader *)buf;
227 if (info->dqi_free_entry) {
228 blk = info->dqi_free_entry;
229 *err = read_blk(info, blk, buf);
230 if (*err < 0)
231 goto out_buf;
232 } else {
233 blk = get_free_dqblk(info);
234 if ((int)blk < 0) {
235 *err = blk;
236 freedqbuf(buf);
237 return 0;
238 }
239 memset(buf, 0, info->dqi_usable_bs);
240 /* This is enough as block is already zeroed and entry list is empty... */
241 info->dqi_free_entry = blk;
242 mark_info_dirty(dquot->dq_sb, dquot->dq_type);
243 }
244 /* Block will be full? */
245 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
246 *err = remove_free_dqentry(info, buf, blk);
247 if (*err < 0) {
248 printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
249 "remove block (%u) from entry free list.\n",
250 blk);
251 goto out_buf;
252 }
253 }
254 le16_add_cpu(&dh->dqdh_entries, 1);
255 /* Find free structure in block */
256 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
257 i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
258 i++, ddquot += info->dqi_entry_size);
259#ifdef __QUOTA_QT_PARANOIA
260 if (i == qtree_dqstr_in_blk(info)) {
261 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
262 "but it shouldn't.\n");
263 *err = -EIO;
264 goto out_buf;
265 }
266#endif
267 *err = write_blk(info, blk, buf);
268 if (*err < 0) {
269 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
270 "data block %u.\n", blk);
271 goto out_buf;
272 }
273 dquot->dq_off = (blk << info->dqi_blocksize_bits) +
274 sizeof(struct qt_disk_dqdbheader) +
275 i * info->dqi_entry_size;
276 freedqbuf(buf);
277 return blk;
278out_buf:
279 freedqbuf(buf);
280 return 0;
281}
282
283/* Insert reference to structure into the trie */
284static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
285 uint *treeblk, int depth)
286{
287 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
288 int ret = 0, newson = 0, newact = 0;
289 __le32 *ref;
290 uint newblk;
291
292 if (!buf)
293 return -ENOMEM;
294 if (!*treeblk) {
295 ret = get_free_dqblk(info);
296 if (ret < 0)
297 goto out_buf;
298 *treeblk = ret;
299 memset(buf, 0, info->dqi_usable_bs);
300 newact = 1;
301 } else {
302 ret = read_blk(info, *treeblk, buf);
303 if (ret < 0) {
304 printk(KERN_ERR "VFS: Can't read tree quota block "
305 "%u.\n", *treeblk);
306 goto out_buf;
307 }
308 }
309 ref = (__le32 *)buf;
310 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
311 if (!newblk)
312 newson = 1;
313 if (depth == info->dqi_qtree_depth - 1) {
314#ifdef __QUOTA_QT_PARANOIA
315 if (newblk) {
316 printk(KERN_ERR "VFS: Inserting already present quota "
317 "entry (block %u).\n",
318 le32_to_cpu(ref[get_index(info,
319 dquot->dq_id, depth)]));
320 ret = -EIO;
321 goto out_buf;
322 }
323#endif
324 newblk = find_free_dqentry(info, dquot, &ret);
325 } else {
326 ret = do_insert_tree(info, dquot, &newblk, depth+1);
327 }
328 if (newson && ret >= 0) {
329 ref[get_index(info, dquot->dq_id, depth)] =
330 cpu_to_le32(newblk);
331 ret = write_blk(info, *treeblk, buf);
332 } else if (newact && ret < 0) {
333 put_free_dqblk(info, buf, *treeblk);
334 }
335out_buf:
336 freedqbuf(buf);
337 return ret;
338}
339
340/* Wrapper for inserting quota structure into tree */
341static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
342 struct dquot *dquot)
343{
344 int tmp = QT_TREEOFF;
345 return do_insert_tree(info, dquot, &tmp, 0);
346}
347
348/*
349 * We don't have to be afraid of deadlocks as we never have quotas on quota files...
350 */
351int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
352{
353 int type = dquot->dq_type;
354 struct super_block *sb = dquot->dq_sb;
355 ssize_t ret;
356 dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
357
358 if (!ddquot)
359 return -ENOMEM;
360
361 /* dq_off is guarded by dqio_mutex */
362 if (!dquot->dq_off) {
363 ret = dq_insert_tree(info, dquot);
364 if (ret < 0) {
365 printk(KERN_ERR "VFS: Error %zd occurred while "
366 "creating quota.\n", ret);
367 freedqbuf(ddquot);
368 return ret;
369 }
370 }
371 spin_lock(&dq_data_lock);
372 info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
373 spin_unlock(&dq_data_lock);
374 ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
375 info->dqi_entry_size, dquot->dq_off);
376 if (ret != info->dqi_entry_size) {
377 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
378 sb->s_id);
379 if (ret >= 0)
380 ret = -ENOSPC;
381 } else {
382 ret = 0;
383 }
384 dqstats.writes++;
385 freedqbuf(ddquot);
386
387 return ret;
388}
389EXPORT_SYMBOL(qtree_write_dquot);
390
391/* Free dquot entry in data block */
392static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
393 uint blk)
394{
395 struct qt_disk_dqdbheader *dh;
396 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
397 int ret = 0;
398
399 if (!buf)
400 return -ENOMEM;
401 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
402 printk(KERN_ERR "VFS: Quota structure has offset to other "
403 "block (%u) than it should (%u).\n", blk,
404 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
405 goto out_buf;
406 }
407 ret = read_blk(info, blk, buf);
408 if (ret < 0) {
409 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
410 goto out_buf;
411 }
412 dh = (struct qt_disk_dqdbheader *)buf;
413 le16_add_cpu(&dh->dqdh_entries, -1);
414 if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
415 ret = remove_free_dqentry(info, buf, blk);
416 if (ret >= 0)
417 ret = put_free_dqblk(info, buf, blk);
418 if (ret < 0) {
419 printk(KERN_ERR "VFS: Can't move quota data block (%u) "
420 "to free list.\n", blk);
421 goto out_buf;
422 }
423 } else {
424 memset(buf +
425 (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
426 0, info->dqi_entry_size);
427 if (le16_to_cpu(dh->dqdh_entries) ==
428 qtree_dqstr_in_blk(info) - 1) {
429 /* Insert will write block itself */
430 ret = insert_free_dqentry(info, buf, blk);
431 if (ret < 0) {
432 printk(KERN_ERR "VFS: Can't insert quota data "
433 "block (%u) to free entry list.\n", blk);
434 goto out_buf;
435 }
436 } else {
437 ret = write_blk(info, blk, buf);
438 if (ret < 0) {
439 printk(KERN_ERR "VFS: Can't write quota data "
440 "block %u\n", blk);
441 goto out_buf;
442 }
443 }
444 }
445 dquot->dq_off = 0; /* Quota is now unattached */
446out_buf:
447 freedqbuf(buf);
448 return ret;
449}
450
451/* Remove reference to dquot from tree */
452static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
453 uint *blk, int depth)
454{
455 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
456 int ret = 0;
457 uint newblk;
458 __le32 *ref = (__le32 *)buf;
459
460 if (!buf)
461 return -ENOMEM;
462 ret = read_blk(info, *blk, buf);
463 if (ret < 0) {
464 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
465 goto out_buf;
466 }
467 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
468 if (depth == info->dqi_qtree_depth - 1) {
469 ret = free_dqentry(info, dquot, newblk);
470 newblk = 0;
471 } else {
472 ret = remove_tree(info, dquot, &newblk, depth+1);
473 }
474 if (ret >= 0 && !newblk) {
475 int i;
476 ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
477 /* Block got empty? */
478 for (i = 0;
479 i < (info->dqi_usable_bs >> 2) && !ref[i];
480 i++);
481 /* Don't put the root block into the free block list */
482 if (i == (info->dqi_usable_bs >> 2)
483 && *blk != QT_TREEOFF) {
484 put_free_dqblk(info, buf, *blk);
485 *blk = 0;
486 } else {
487 ret = write_blk(info, *blk, buf);
488 if (ret < 0)
489 printk(KERN_ERR "VFS: Can't write quota tree "
490 "block %u.\n", *blk);
491 }
492 }
493out_buf:
494 freedqbuf(buf);
495 return ret;
496}
497
498/* Delete dquot from tree */
499int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
500{
501 uint tmp = QT_TREEOFF;
502
503 if (!dquot->dq_off) /* Even not allocated? */
504 return 0;
505 return remove_tree(info, dquot, &tmp, 0);
506}
507EXPORT_SYMBOL(qtree_delete_dquot);
508
509/* Find entry in block */
510static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
511 struct dquot *dquot, uint blk)
512{
513 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
514 loff_t ret = 0;
515 int i;
516 char *ddquot;
517
518 if (!buf)
519 return -ENOMEM;
520 ret = read_blk(info, blk, buf);
521 if (ret < 0) {
522 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
523 goto out_buf;
524 }
525 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
526 i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
527 i++, ddquot += info->dqi_entry_size);
528 if (i == qtree_dqstr_in_blk(info)) {
529 printk(KERN_ERR "VFS: Quota for id %u referenced "
530 "but not present.\n", dquot->dq_id);
531 ret = -EIO;
532 goto out_buf;
533 } else {
534 ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
535 qt_disk_dqdbheader) + i * info->dqi_entry_size;
536 }
537out_buf:
538 freedqbuf(buf);
539 return ret;
540}
541
542/* Find entry for given id in the tree */
543static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
544 struct dquot *dquot, uint blk, int depth)
545{
546 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
547 loff_t ret = 0;
548 __le32 *ref = (__le32 *)buf;
549
550 if (!buf)
551 return -ENOMEM;
552 ret = read_blk(info, blk, buf);
553 if (ret < 0) {
554 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
555 goto out_buf;
556 }
557 ret = 0;
558 blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
559 if (!blk) /* No reference? */
560 goto out_buf;
561 if (depth < info->dqi_qtree_depth - 1)
562 ret = find_tree_dqentry(info, dquot, blk, depth+1);
563 else
564 ret = find_block_dqentry(info, dquot, blk);
565out_buf:
566 freedqbuf(buf);
567 return ret;
568}
569
570/* Find entry for given id in the tree - wrapper function */
571static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
572 struct dquot *dquot)
573{
574 return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
575}
576
577int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
578{
579 int type = dquot->dq_type;
580 struct super_block *sb = dquot->dq_sb;
581 loff_t offset;
582 dqbuf_t ddquot;
583 int ret = 0;
584
585#ifdef __QUOTA_QT_PARANOIA
586 /* Invalidated quota? */
587 if (!sb_dqopt(dquot->dq_sb)->files[type]) {
588 printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
589 return -EIO;
590 }
591#endif
592 /* Do we know offset of the dquot entry in the quota file? */
593 if (!dquot->dq_off) {
594 offset = find_dqentry(info, dquot);
595 if (offset <= 0) { /* Entry not present? */
596 if (offset < 0)
597 printk(KERN_ERR "VFS: Can't read quota "
598 "structure for id %u.\n", dquot->dq_id);
599 dquot->dq_off = 0;
600 set_bit(DQ_FAKE_B, &dquot->dq_flags);
601 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
602 ret = offset;
603 goto out;
604 }
605 dquot->dq_off = offset;
606 }
607 ddquot = getdqbuf(info->dqi_entry_size);
608 if (!ddquot)
609 return -ENOMEM;
610 ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
611 info->dqi_entry_size, dquot->dq_off);
612 if (ret != info->dqi_entry_size) {
613 if (ret >= 0)
614 ret = -EIO;
615 printk(KERN_ERR "VFS: Error while reading quota "
616 "structure for id %u.\n", dquot->dq_id);
617 set_bit(DQ_FAKE_B, &dquot->dq_flags);
618 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
619 freedqbuf(ddquot);
620 goto out;
621 }
622 spin_lock(&dq_data_lock);
623 info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
624 if (!dquot->dq_dqb.dqb_bhardlimit &&
625 !dquot->dq_dqb.dqb_bsoftlimit &&
626 !dquot->dq_dqb.dqb_ihardlimit &&
627 !dquot->dq_dqb.dqb_isoftlimit)
628 set_bit(DQ_FAKE_B, &dquot->dq_flags);
629 spin_unlock(&dq_data_lock);
630 freedqbuf(ddquot);
631out:
632 dqstats.reads++;
633 return ret;
634}
635EXPORT_SYMBOL(qtree_read_dquot);
636
637/* Check whether dquot should not be deleted. We know we are
638 * the only one operating on dquot (thanks to dq_lock) */
639int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
640{
641 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
642 return qtree_delete_dquot(info, dquot);
643 return 0;
644}
645EXPORT_SYMBOL(qtree_release_dquot);
diff --git a/fs/quota_tree.h b/fs/quota_tree.h
new file mode 100644
index 000000000000..a1ab8db81a51
--- /dev/null
+++ b/fs/quota_tree.h
@@ -0,0 +1,25 @@
1/*
2 * Definitions of structures for vfsv0 quota format
3 */
4
5#ifndef _LINUX_QUOTA_TREE_H
6#define _LINUX_QUOTA_TREE_H
7
8#include <linux/types.h>
9#include <linux/quota.h>
10
11/*
12 * Structure of header of block with quota structures. It is padded to 16 bytes (by
13 * dqdh_pad1 and dqdh_pad2) so there will be space for exactly 21 quota-entries in a block
14 * (NOTE(review): 21 presumably assumes the vfsv0 block and entry sizes - confirm). */
15struct qt_disk_dqdbheader {
16 __le32 dqdh_next_free; /* Number of next block with free entry */
17 __le32 dqdh_prev_free; /* Number of previous block with free entry */
18 __le16 dqdh_entries; /* Number of valid entries in block */
19 __le16 dqdh_pad1; /* Padding */
20 __le32 dqdh_pad2; /* Padding */
21};
22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24
25#endif /* _LINUX_QUOTA_TREE_H */
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 5ae15b13eeb0..b4af1c69ad16 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -3,25 +3,39 @@
3#include <linux/quota.h> 3#include <linux/quota.h>
4#include <linux/quotaops.h> 4#include <linux/quotaops.h>
5#include <linux/dqblk_v1.h> 5#include <linux/dqblk_v1.h>
6#include <linux/quotaio_v1.h>
7#include <linux/kernel.h> 6#include <linux/kernel.h>
8#include <linux/init.h> 7#include <linux/init.h>
9#include <linux/module.h> 8#include <linux/module.h>
10 9
11#include <asm/byteorder.h> 10#include <asm/byteorder.h>
12 11
12#include "quotaio_v1.h"
13
13MODULE_AUTHOR("Jan Kara"); 14MODULE_AUTHOR("Jan Kara");
14MODULE_DESCRIPTION("Old quota format support"); 15MODULE_DESCRIPTION("Old quota format support");
15MODULE_LICENSE("GPL"); 16MODULE_LICENSE("GPL");
16 17
18#define QUOTABLOCK_BITS 10
19#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
20
21static inline qsize_t v1_stoqb(qsize_t space)
22{
23 return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
24}
25
26static inline qsize_t v1_qbtos(qsize_t blocks)
27{
28 return blocks << QUOTABLOCK_BITS;
29}
30
17static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d) 31static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
18{ 32{
19 m->dqb_ihardlimit = d->dqb_ihardlimit; 33 m->dqb_ihardlimit = d->dqb_ihardlimit;
20 m->dqb_isoftlimit = d->dqb_isoftlimit; 34 m->dqb_isoftlimit = d->dqb_isoftlimit;
21 m->dqb_curinodes = d->dqb_curinodes; 35 m->dqb_curinodes = d->dqb_curinodes;
22 m->dqb_bhardlimit = d->dqb_bhardlimit; 36 m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
23 m->dqb_bsoftlimit = d->dqb_bsoftlimit; 37 m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
24 m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS; 38 m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
25 m->dqb_itime = d->dqb_itime; 39 m->dqb_itime = d->dqb_itime;
26 m->dqb_btime = d->dqb_btime; 40 m->dqb_btime = d->dqb_btime;
27} 41}
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
31 d->dqb_ihardlimit = m->dqb_ihardlimit; 45 d->dqb_ihardlimit = m->dqb_ihardlimit;
32 d->dqb_isoftlimit = m->dqb_isoftlimit; 46 d->dqb_isoftlimit = m->dqb_isoftlimit;
33 d->dqb_curinodes = m->dqb_curinodes; 47 d->dqb_curinodes = m->dqb_curinodes;
34 d->dqb_bhardlimit = m->dqb_bhardlimit; 48 d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
35 d->dqb_bsoftlimit = m->dqb_bsoftlimit; 49 d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
36 d->dqb_curblocks = toqb(m->dqb_curspace); 50 d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
37 d->dqb_itime = m->dqb_itime; 51 d->dqb_itime = m->dqb_itime;
38 d->dqb_btime = m->dqb_btime; 52 d->dqb_btime = m->dqb_btime;
39} 53}
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b53827dc02d9..b618b563635c 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -6,7 +6,6 @@
6#include <linux/fs.h> 6#include <linux/fs.h>
7#include <linux/mount.h> 7#include <linux/mount.h>
8#include <linux/dqblk_v2.h> 8#include <linux/dqblk_v2.h>
9#include <linux/quotaio_v2.h>
10#include <linux/kernel.h> 9#include <linux/kernel.h>
11#include <linux/init.h> 10#include <linux/init.h>
12#include <linux/module.h> 11#include <linux/module.h>
@@ -15,16 +14,37 @@
15 14
16#include <asm/byteorder.h> 15#include <asm/byteorder.h>
17 16
17#include "quota_tree.h"
18#include "quotaio_v2.h"
19
18MODULE_AUTHOR("Jan Kara"); 20MODULE_AUTHOR("Jan Kara");
19MODULE_DESCRIPTION("Quota format v2 support"); 21MODULE_DESCRIPTION("Quota format v2 support");
20MODULE_LICENSE("GPL"); 22MODULE_LICENSE("GPL");
21 23
22#define __QUOTA_V2_PARANOIA 24#define __QUOTA_V2_PARANOIA
23 25
24typedef char *dqbuf_t; 26static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
27static void v2_disk2memdqb(struct dquot *dquot, void *dp);
28static int v2_is_id(void *dp, struct dquot *dquot);
29
30static struct qtree_fmt_operations v2_qtree_ops = {
31 .mem2disk_dqblk = v2_mem2diskdqb,
32 .disk2mem_dqblk = v2_disk2memdqb,
33 .is_id = v2_is_id,
34};
35
36#define QUOTABLOCK_BITS 10
37#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
25 38
26#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff) 39static inline qsize_t v2_stoqb(qsize_t space)
27#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader))) 40{
41 return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
42}
43
44static inline qsize_t v2_qbtos(qsize_t blocks)
45{
46 return blocks << QUOTABLOCK_BITS;
47}
28 48
29/* Check whether given file is really vfsv0 quotafile */ 49/* Check whether given file is really vfsv0 quotafile */
30static int v2_check_quota_file(struct super_block *sb, int type) 50static int v2_check_quota_file(struct super_block *sb, int type)
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
50static int v2_read_file_info(struct super_block *sb, int type) 70static int v2_read_file_info(struct super_block *sb, int type)
51{ 71{
52 struct v2_disk_dqinfo dinfo; 72 struct v2_disk_dqinfo dinfo;
53 struct mem_dqinfo *info = sb_dqopt(sb)->info+type; 73 struct mem_dqinfo *info = sb_dqinfo(sb, type);
74 struct qtree_mem_dqinfo *qinfo;
54 ssize_t size; 75 ssize_t size;
55 76
56 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 77 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
60 sb->s_id); 81 sb->s_id);
61 return -1; 82 return -1;
62 } 83 }
84 info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
85 if (!info->dqi_priv) {
86 printk(KERN_WARNING
87 "Not enough memory for quota information structure.\n");
88 return -1;
89 }
90 qinfo = info->dqi_priv;
63 /* limits are stored as unsigned 32-bit data */ 91 /* limits are stored as unsigned 32-bit data */
64 info->dqi_maxblimit = 0xffffffff; 92 info->dqi_maxblimit = 0xffffffff;
65 info->dqi_maxilimit = 0xffffffff; 93 info->dqi_maxilimit = 0xffffffff;
66 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); 94 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
67 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); 95 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
68 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); 96 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
69 info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); 97 qinfo->dqi_sb = sb;
70 info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); 98 qinfo->dqi_type = type;
71 info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); 99 qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
100 qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
101 qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
102 qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
103 qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
104 qinfo->dqi_qtree_depth = qtree_depth(qinfo);
105 qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
106 qinfo->dqi_ops = &v2_qtree_ops;
72 return 0; 107 return 0;
73} 108}
74 109
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
76static int v2_write_file_info(struct super_block *sb, int type) 111static int v2_write_file_info(struct super_block *sb, int type)
77{ 112{
78 struct v2_disk_dqinfo dinfo; 113 struct v2_disk_dqinfo dinfo;
79 struct mem_dqinfo *info = sb_dqopt(sb)->info+type; 114 struct mem_dqinfo *info = sb_dqinfo(sb, type);
115 struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
80 ssize_t size; 116 ssize_t size;
81 117
82 spin_lock(&dq_data_lock); 118 spin_lock(&dq_data_lock);
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
85 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); 121 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
86 dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); 122 dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
87 spin_unlock(&dq_data_lock); 123 spin_unlock(&dq_data_lock);
88 dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks); 124 dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
89 dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk); 125 dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
90 dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry); 126 dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
91 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 127 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
92 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 128 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
93 if (size != sizeof(struct v2_disk_dqinfo)) { 129 if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type)
98 return 0; 134 return 0;
99} 135}
100 136
101static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d) 137static void v2_disk2memdqb(struct dquot *dquot, void *dp)
102{ 138{
139 struct v2_disk_dqblk *d = dp, empty;
140 struct mem_dqblk *m = &dquot->dq_dqb;
141
103 m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); 142 m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
104 m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); 143 m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
105 m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); 144 m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
106 m->dqb_itime = le64_to_cpu(d->dqb_itime); 145 m->dqb_itime = le64_to_cpu(d->dqb_itime);
107 m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); 146 m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
108 m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); 147 m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
109 m->dqb_curspace = le64_to_cpu(d->dqb_curspace); 148 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
110 m->dqb_btime = le64_to_cpu(d->dqb_btime); 149 m->dqb_btime = le64_to_cpu(d->dqb_btime);
150 /* We need to escape back all-zero structure */
151 memset(&empty, 0, sizeof(struct v2_disk_dqblk));
152 empty.dqb_itime = cpu_to_le64(1);
153 if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
154 m->dqb_itime = 0;
111} 155}
112 156
113static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id) 157static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
114{ 158{
159 struct v2_disk_dqblk *d = dp;
160 struct mem_dqblk *m = &dquot->dq_dqb;
161 struct qtree_mem_dqinfo *info =
162 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
163
115 d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); 164 d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
116 d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); 165 d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
117 d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); 166 d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
118 d->dqb_itime = cpu_to_le64(m->dqb_itime); 167 d->dqb_itime = cpu_to_le64(m->dqb_itime);
119 d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); 168 d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
120 d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); 169 d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
121 d->dqb_curspace = cpu_to_le64(m->dqb_curspace); 170 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
122 d->dqb_btime = cpu_to_le64(m->dqb_btime); 171 d->dqb_btime = cpu_to_le64(m->dqb_btime);
123 d->dqb_id = cpu_to_le32(id); 172 d->dqb_id = cpu_to_le32(dquot->dq_id);
124} 173 if (qtree_entry_unused(info, dp))
125 174 d->dqb_itime = cpu_to_le64(1);
126static dqbuf_t getdqbuf(void)
127{
128 dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
129 if (!buf)
130 printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
131 return buf;
132}
133
134static inline void freedqbuf(dqbuf_t buf)
135{
136 kfree(buf);
137}
138
139static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
140{
141 memset(buf, 0, V2_DQBLKSIZE);
142 return sb->s_op->quota_read(sb, type, (char *)buf,
143 V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
144}
145
146static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
147{
148 return sb->s_op->quota_write(sb, type, (char *)buf,
149 V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
150}
151
152/* Remove empty block from list and return it */
153static int get_free_dqblk(struct super_block *sb, int type)
154{
155 dqbuf_t buf = getdqbuf();
156 struct mem_dqinfo *info = sb_dqinfo(sb, type);
157 struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
158 int ret, blk;
159
160 if (!buf)
161 return -ENOMEM;
162 if (info->u.v2_i.dqi_free_blk) {
163 blk = info->u.v2_i.dqi_free_blk;
164 if ((ret = read_blk(sb, type, blk, buf)) < 0)
165 goto out_buf;
166 info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
167 }
168 else {
169 memset(buf, 0, V2_DQBLKSIZE);
170 /* Assure block allocation... */
171 if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
172 goto out_buf;
173 blk = info->u.v2_i.dqi_blocks++;
174 }
175 mark_info_dirty(sb, type);
176 ret = blk;
177out_buf:
178 freedqbuf(buf);
179 return ret;
180}
181
182/* Insert empty block to the list */
183static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
184{
185 struct mem_dqinfo *info = sb_dqinfo(sb, type);
186 struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
187 int err;
188
189 dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
190 dh->dqdh_prev_free = cpu_to_le32(0);
191 dh->dqdh_entries = cpu_to_le16(0);
192 info->u.v2_i.dqi_free_blk = blk;
193 mark_info_dirty(sb, type);
194 /* Some strange block. We had better leave it... */
195 if ((err = write_blk(sb, type, blk, buf)) < 0)
196 return err;
197 return 0;
198} 175}
199 176
200/* Remove given block from the list of blocks with free entries */ 177static int v2_is_id(void *dp, struct dquot *dquot)
201static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
202{ 178{
203 dqbuf_t tmpbuf = getdqbuf(); 179 struct v2_disk_dqblk *d = dp;
204 struct mem_dqinfo *info = sb_dqinfo(sb, type); 180 struct qtree_mem_dqinfo *info =
205 struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; 181 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
206 uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
207 int err;
208 182
209 if (!tmpbuf) 183 if (qtree_entry_unused(info, dp))
210 return -ENOMEM;
211 if (nextblk) {
212 if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
213 goto out_buf;
214 ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
215 if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
216 goto out_buf;
217 }
218 if (prevblk) {
219 if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
220 goto out_buf;
221 ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
222 if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
223 goto out_buf;
224 }
225 else {
226 info->u.v2_i.dqi_free_entry = nextblk;
227 mark_info_dirty(sb, type);
228 }
229 freedqbuf(tmpbuf);
230 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
231 /* No matter whether write succeeds block is out of list */
232 if (write_blk(sb, type, blk, buf) < 0)
233 printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
234 return 0;
235out_buf:
236 freedqbuf(tmpbuf);
237 return err;
238}
239
240/* Insert given block to the beginning of list with free entries */
241static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
242{
243 dqbuf_t tmpbuf = getdqbuf();
244 struct mem_dqinfo *info = sb_dqinfo(sb, type);
245 struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
246 int err;
247
248 if (!tmpbuf)
249 return -ENOMEM;
250 dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
251 dh->dqdh_prev_free = cpu_to_le32(0);
252 if ((err = write_blk(sb, type, blk, buf)) < 0)
253 goto out_buf;
254 if (info->u.v2_i.dqi_free_entry) {
255 if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
256 goto out_buf;
257 ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
258 if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
259 goto out_buf;
260 }
261 freedqbuf(tmpbuf);
262 info->u.v2_i.dqi_free_entry = blk;
263 mark_info_dirty(sb, type);
264 return 0;
265out_buf:
266 freedqbuf(tmpbuf);
267 return err;
268}
269
270/* Find space for dquot */
271static uint find_free_dqentry(struct dquot *dquot, int *err)
272{
273 struct super_block *sb = dquot->dq_sb;
274 struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
275 uint blk, i;
276 struct v2_disk_dqdbheader *dh;
277 struct v2_disk_dqblk *ddquot;
278 struct v2_disk_dqblk fakedquot;
279 dqbuf_t buf;
280
281 *err = 0;
282 if (!(buf = getdqbuf())) {
283 *err = -ENOMEM;
284 return 0; 184 return 0;
285 } 185 return le32_to_cpu(d->dqb_id) == dquot->dq_id;
286 dh = (struct v2_disk_dqdbheader *)buf;
287 ddquot = GETENTRIES(buf);
288 if (info->u.v2_i.dqi_free_entry) {
289 blk = info->u.v2_i.dqi_free_entry;
290 if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
291 goto out_buf;
292 }
293 else {
294 blk = get_free_dqblk(sb, dquot->dq_type);
295 if ((int)blk < 0) {
296 *err = blk;
297 freedqbuf(buf);
298 return 0;
299 }
300 memset(buf, 0, V2_DQBLKSIZE);
301 /* This is enough as block is already zeroed and entry list is empty... */
302 info->u.v2_i.dqi_free_entry = blk;
303 mark_info_dirty(sb, dquot->dq_type);
304 }
305 if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */
306 if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
307 printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
308 goto out_buf;
309 }
310 le16_add_cpu(&dh->dqdh_entries, 1);
311 memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
312 /* Find free structure in block */
313 for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
314#ifdef __QUOTA_V2_PARANOIA
315 if (i == V2_DQSTRINBLK) {
316 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
317 *err = -EIO;
318 goto out_buf;
319 }
320#endif
321 if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
322 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
323 goto out_buf;
324 }
325 dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
326 freedqbuf(buf);
327 return blk;
328out_buf:
329 freedqbuf(buf);
330 return 0;
331}
332
333/* Insert reference to structure into the trie */
334static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
335{
336 struct super_block *sb = dquot->dq_sb;
337 dqbuf_t buf;
338 int ret = 0, newson = 0, newact = 0;
339 __le32 *ref;
340 uint newblk;
341
342 if (!(buf = getdqbuf()))
343 return -ENOMEM;
344 if (!*treeblk) {
345 ret = get_free_dqblk(sb, dquot->dq_type);
346 if (ret < 0)
347 goto out_buf;
348 *treeblk = ret;
349 memset(buf, 0, V2_DQBLKSIZE);
350 newact = 1;
351 }
352 else {
353 if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
354 printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
355 goto out_buf;
356 }
357 }
358 ref = (__le32 *)buf;
359 newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
360 if (!newblk)
361 newson = 1;
362 if (depth == V2_DQTREEDEPTH-1) {
363#ifdef __QUOTA_V2_PARANOIA
364 if (newblk) {
365 printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
366 ret = -EIO;
367 goto out_buf;
368 }
369#endif
370 newblk = find_free_dqentry(dquot, &ret);
371 }
372 else
373 ret = do_insert_tree(dquot, &newblk, depth+1);
374 if (newson && ret >= 0) {
375 ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
376 ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
377 }
378 else if (newact && ret < 0)
379 put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
380out_buf:
381 freedqbuf(buf);
382 return ret;
383} 186}
384 187
385/* Wrapper for inserting quota structure into tree */ 188static int v2_read_dquot(struct dquot *dquot)
386static inline int dq_insert_tree(struct dquot *dquot)
387{ 189{
388 int tmp = V2_DQTREEOFF; 190 return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
389 return do_insert_tree(dquot, &tmp, 0);
390} 191}
391 192
392/*
393 * We don't have to be afraid of deadlocks as we never have quotas on quota files...
394 */
395static int v2_write_dquot(struct dquot *dquot) 193static int v2_write_dquot(struct dquot *dquot)
396{ 194{
397 int type = dquot->dq_type; 195 return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
398 ssize_t ret;
399 struct v2_disk_dqblk ddquot, empty;
400
401 /* dq_off is guarded by dqio_mutex */
402 if (!dquot->dq_off)
403 if ((ret = dq_insert_tree(dquot)) < 0) {
404 printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
405 return ret;
406 }
407 spin_lock(&dq_data_lock);
408 mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
409 /* Argh... We may need to write structure full of zeroes but that would be
410 * treated as an empty place by the rest of the code. Format change would
411 * be definitely cleaner but the problems probably are not worth it */
412 memset(&empty, 0, sizeof(struct v2_disk_dqblk));
413 if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
414 ddquot.dqb_itime = cpu_to_le64(1);
415 spin_unlock(&dq_data_lock);
416 ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
417 (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
418 if (ret != sizeof(struct v2_disk_dqblk)) {
419 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
420 if (ret >= 0)
421 ret = -ENOSPC;
422 }
423 else
424 ret = 0;
425 dqstats.writes++;
426
427 return ret;
428} 196}
429 197
430/* Free dquot entry in data block */ 198static int v2_release_dquot(struct dquot *dquot)
431static int free_dqentry(struct dquot *dquot, uint blk)
432{
433 struct super_block *sb = dquot->dq_sb;
434 int type = dquot->dq_type;
435 struct v2_disk_dqdbheader *dh;
436 dqbuf_t buf = getdqbuf();
437 int ret = 0;
438
439 if (!buf)
440 return -ENOMEM;
441 if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
442 printk(KERN_ERR "VFS: Quota structure has offset to other "
443 "block (%u) than it should (%u).\n", blk,
444 (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
445 goto out_buf;
446 }
447 if ((ret = read_blk(sb, type, blk, buf)) < 0) {
448 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
449 goto out_buf;
450 }
451 dh = (struct v2_disk_dqdbheader *)buf;
452 le16_add_cpu(&dh->dqdh_entries, -1);
453 if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
454 if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
455 (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
456 printk(KERN_ERR "VFS: Can't move quota data block (%u) "
457 "to free list.\n", blk);
458 goto out_buf;
459 }
460 }
461 else {
462 memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
463 sizeof(struct v2_disk_dqblk));
464 if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
465 /* Insert will write block itself */
466 if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
467 printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
468 goto out_buf;
469 }
470 }
471 else
472 if ((ret = write_blk(sb, type, blk, buf)) < 0) {
473 printk(KERN_ERR "VFS: Can't write quota data "
474 "block %u\n", blk);
475 goto out_buf;
476 }
477 }
478 dquot->dq_off = 0; /* Quota is now unattached */
479out_buf:
480 freedqbuf(buf);
481 return ret;
482}
483
484/* Remove reference to dquot from tree */
485static int remove_tree(struct dquot *dquot, uint *blk, int depth)
486{
487 struct super_block *sb = dquot->dq_sb;
488 int type = dquot->dq_type;
489 dqbuf_t buf = getdqbuf();
490 int ret = 0;
491 uint newblk;
492 __le32 *ref = (__le32 *)buf;
493
494 if (!buf)
495 return -ENOMEM;
496 if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
497 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
498 goto out_buf;
499 }
500 newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
501 if (depth == V2_DQTREEDEPTH-1) {
502 ret = free_dqentry(dquot, newblk);
503 newblk = 0;
504 }
505 else
506 ret = remove_tree(dquot, &newblk, depth+1);
507 if (ret >= 0 && !newblk) {
508 int i;
509 ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
510 for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */
511 /* Don't put the root block into the free block list */
512 if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
513 put_free_dqblk(sb, type, buf, *blk);
514 *blk = 0;
515 }
516 else
517 if ((ret = write_blk(sb, type, *blk, buf)) < 0)
518 printk(KERN_ERR "VFS: Can't write quota tree "
519 "block %u.\n", *blk);
520 }
521out_buf:
522 freedqbuf(buf);
523 return ret;
524}
525
526/* Delete dquot from tree */
527static int v2_delete_dquot(struct dquot *dquot)
528{
529 uint tmp = V2_DQTREEOFF;
530
531 if (!dquot->dq_off) /* Even not allocated? */
532 return 0;
533 return remove_tree(dquot, &tmp, 0);
534}
535
536/* Find entry in block */
537static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
538{
539 dqbuf_t buf = getdqbuf();
540 loff_t ret = 0;
541 int i;
542 struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
543
544 if (!buf)
545 return -ENOMEM;
546 if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
547 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
548 goto out_buf;
549 }
550 if (dquot->dq_id)
551 for (i = 0; i < V2_DQSTRINBLK &&
552 le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
553 else { /* ID 0 as a bit more complicated searching... */
554 struct v2_disk_dqblk fakedquot;
555
556 memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
557 for (i = 0; i < V2_DQSTRINBLK; i++)
558 if (!le32_to_cpu(ddquot[i].dqb_id) &&
559 memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
560 break;
561 }
562 if (i == V2_DQSTRINBLK) {
563 printk(KERN_ERR "VFS: Quota for id %u referenced "
564 "but not present.\n", dquot->dq_id);
565 ret = -EIO;
566 goto out_buf;
567 }
568 else
569 ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
570 v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
571out_buf:
572 freedqbuf(buf);
573 return ret;
574}
575
576/* Find entry for given id in the tree */
577static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
578{
579 dqbuf_t buf = getdqbuf();
580 loff_t ret = 0;
581 __le32 *ref = (__le32 *)buf;
582
583 if (!buf)
584 return -ENOMEM;
585 if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
586 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
587 goto out_buf;
588 }
589 ret = 0;
590 blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
591 if (!blk) /* No reference? */
592 goto out_buf;
593 if (depth < V2_DQTREEDEPTH-1)
594 ret = find_tree_dqentry(dquot, blk, depth+1);
595 else
596 ret = find_block_dqentry(dquot, blk);
597out_buf:
598 freedqbuf(buf);
599 return ret;
600}
601
602/* Find entry for given id in the tree - wrapper function */
603static inline loff_t find_dqentry(struct dquot *dquot)
604{
605 return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
606}
607
608static int v2_read_dquot(struct dquot *dquot)
609{ 199{
610 int type = dquot->dq_type; 200 return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
611 loff_t offset;
612 struct v2_disk_dqblk ddquot, empty;
613 int ret = 0;
614
615#ifdef __QUOTA_V2_PARANOIA
616 /* Invalidated quota? */
617 if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
618 printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
619 return -EIO;
620 }
621#endif
622 offset = find_dqentry(dquot);
623 if (offset <= 0) { /* Entry not present? */
624 if (offset < 0)
625 printk(KERN_ERR "VFS: Can't read quota "
626 "structure for id %u.\n", dquot->dq_id);
627 dquot->dq_off = 0;
628 set_bit(DQ_FAKE_B, &dquot->dq_flags);
629 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
630 ret = offset;
631 }
632 else {
633 dquot->dq_off = offset;
634 if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
635 (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
636 != sizeof(struct v2_disk_dqblk)) {
637 if (ret >= 0)
638 ret = -EIO;
639 printk(KERN_ERR "VFS: Error while reading quota "
640 "structure for id %u.\n", dquot->dq_id);
641 memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
642 }
643 else {
644 ret = 0;
645 /* We need to escape back all-zero structure */
646 memset(&empty, 0, sizeof(struct v2_disk_dqblk));
647 empty.dqb_itime = cpu_to_le64(1);
648 if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
649 ddquot.dqb_itime = 0;
650 }
651 disk2memdqb(&dquot->dq_dqb, &ddquot);
652 if (!dquot->dq_dqb.dqb_bhardlimit &&
653 !dquot->dq_dqb.dqb_bsoftlimit &&
654 !dquot->dq_dqb.dqb_ihardlimit &&
655 !dquot->dq_dqb.dqb_isoftlimit)
656 set_bit(DQ_FAKE_B, &dquot->dq_flags);
657 }
658 dqstats.reads++;
659
660 return ret;
661} 201}
662 202
663/* Check whether dquot should not be deleted. We know we are 203static int v2_free_file_info(struct super_block *sb, int type)
664 * the only one operating on dquot (thanks to dq_lock) */
665static int v2_release_dquot(struct dquot *dquot)
666{ 204{
667 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) 205 kfree(sb_dqinfo(sb, type)->dqi_priv);
668 return v2_delete_dquot(dquot);
669 return 0; 206 return 0;
670} 207}
671 208
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = {
673 .check_quota_file = v2_check_quota_file, 210 .check_quota_file = v2_check_quota_file,
674 .read_file_info = v2_read_file_info, 211 .read_file_info = v2_read_file_info,
675 .write_file_info = v2_write_file_info, 212 .write_file_info = v2_write_file_info,
676 .free_file_info = NULL, 213 .free_file_info = v2_free_file_info,
677 .read_dqblk = v2_read_dquot, 214 .read_dqblk = v2_read_dquot,
678 .commit_dqblk = v2_write_dquot, 215 .commit_dqblk = v2_write_dquot,
679 .release_dqblk = v2_release_dquot, 216 .release_dqblk = v2_release_dquot,
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h
new file mode 100644
index 000000000000..746654b5de70
--- /dev/null
+++ b/fs/quotaio_v1.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_QUOTAIO_V1_H
2#define _LINUX_QUOTAIO_V1_H
3
4#include <linux/types.h>
5
6/*
7 * The following constants define the amount of time given a user
8 * before the soft limits are treated as hard limits (usually resulting
9 * in an allocation failure). The timer is started when the user crosses
10 * their soft limit, it is reset when they go below their soft limit.
11 */
12#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
13#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
14
15/*
16 * The following structure defines the format of the disk quota file
17 * (as it appears on disk) - the file is an array of these structures
18 * indexed by user or group number. fs/quota_v1.c converts the three block fields to bytes with v1_qbtos() and copies the rest as is.
19 */
20struct v1_disk_dqblk {
21 __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */
22 __u32 dqb_bsoftlimit; /* preferred limit on disk blks */
23 __u32 dqb_curblocks; /* current block count */
24 __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
25 __u32 dqb_isoftlimit; /* preferred inode limit */
26 __u32 dqb_curinodes; /* current # allocated inodes */
27 time_t dqb_btime; /* time limit for excessive disk use */
28 time_t dqb_itime; /* time limit for excessive inode use */
29};
30
/*
 * Byte offset in the quota file of the entry for id UID (the file is an
 * array of struct v1_disk_dqblk indexed by id). UID is widened to loff_t
 * before multiplying so the product cannot wrap in a 32-bit size_t.
 */
31#define v1_dqoff(UID) ((loff_t)(UID) * (loff_t)sizeof (struct v1_disk_dqblk))
32
33#endif /* _LINUX_QUOTAIO_V1_H */
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
new file mode 100644
index 000000000000..530fe580685c
--- /dev/null
+++ b/fs/quotaio_v2.h
@@ -0,0 +1,60 @@
1/*
2 * Definitions of structures for vfsv0 quota format
3 */
4
5#ifndef _LINUX_QUOTAIO_V2_H
6#define _LINUX_QUOTAIO_V2_H
7
8#include <linux/types.h>
9#include <linux/quota.h>
10
11/*
12 * Definitions of magics and versions of current quota files
13 */
14#define V2_INITQMAGICS {\
15 0xd9c01f11, /* USRQUOTA */\
16 0xd9c01927 /* GRPQUOTA */\
17}
18
19#define V2_INITQVERSIONS {\
20 0, /* USRQUOTA */\
21 0 /* GRPQUOTA */\
22}
23
24/* First generic header */
25struct v2_disk_dqheader {
26 __le32 dqh_magic; /* Magic number identifying file */
27 __le32 dqh_version; /* File version */
28};
29
30/*
31 * The following structure defines the format of the disk quota file
32 * (as it appears on disk) - the file is a radix tree whose leaves point
33 * to blocks of these structures.
34 */
35struct v2_disk_dqblk {
36 __le32 dqb_id; /* id this quota applies to */
37 __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */
38 __le32 dqb_isoftlimit; /* preferred inode limit */
39 __le32 dqb_curinodes; /* current # allocated inodes */
40 __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
41 __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
42 __le64 dqb_curspace; /* current space occupied (in bytes) */
43 __le64 dqb_btime; /* time limit for excessive disk use */
44 __le64 dqb_itime; /* time limit for excessive inode use */
45};
46
47/* Header with type and version specific information */
48struct v2_disk_dqinfo {
49 __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
50 __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */
51 __le32 dqi_flags; /* Flags for quotafile (DQF_*) */
52 __le32 dqi_blocks; /* Number of blocks in file */
53 __le32 dqi_free_blk; /* Number of first free block in the list */
54 __le32 dqi_free_entry; /* Number of block with at least one free entry */
55};
56
57#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */
58#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */
59
60#endif /* _LINUX_QUOTAIO_V2_H */
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 76acdbc34611..b9b567a28376 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
262 ret = -ENOMEM; 262 ret = -ENOMEM;
263 pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); 263 pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
264 if (!pages) 264 if (!pages)
265 goto out; 265 goto out_free;
266 266
267 nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); 267 nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
268 if (nr != lpages) 268 if (nr != lpages)
269 goto out; /* leave if some pages were missing */ 269 goto out_free_pages; /* leave if some pages were missing */
270 270
271 /* check the pages for physical adjacency */ 271 /* check the pages for physical adjacency */
272 ptr = pages; 272 ptr = pages;
@@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
274 page++; 274 page++;
275 for (loop = lpages; loop > 1; loop--) 275 for (loop = lpages; loop > 1; loop--)
276 if (*ptr++ != page++) 276 if (*ptr++ != page++)
277 goto out; 277 goto out_free_pages;
278 278
279 /* okay - all conditions fulfilled */ 279 /* okay - all conditions fulfilled */
280 ret = (unsigned long) page_address(pages[0]); 280 ret = (unsigned long) page_address(pages[0]);
281 281
282 out: 282out_free_pages:
283 if (pages) { 283 ptr = pages;
284 ptr = pages; 284 for (loop = nr; loop > 0; loop--)
285 for (loop = lpages; loop > 0; loop--) 285 put_page(*ptr++);
286 put_page(*ptr++); 286out_free:
287 kfree(pages); 287 kfree(pages);
288 } 288out:
289
290 return ret; 289 return ret;
291} 290}
292 291
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a83a3518ae33..b7e6ac706b87 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
57 inode->i_mode = mode; 57 inode->i_mode = mode;
58 inode->i_uid = current_fsuid(); 58 inode->i_uid = current_fsuid();
59 inode->i_gid = current_fsgid(); 59 inode->i_gid = current_fsgid();
60 inode->i_blocks = 0;
61 inode->i_mapping->a_ops = &ramfs_aops; 60 inode->i_mapping->a_ops = &ramfs_aops;
62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 61 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
63 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 62 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9c020b..400fe81c973e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
50 offset += inode->i_size; 50 offset += inode->i_size;
51 break; 51 break;
52 case SEEK_CUR: 52 case SEEK_CUR:
53 /*
54 * Here we special-case the lseek(fd, 0, SEEK_CUR)
55 * position-querying operation. Avoid rewriting the "same"
56 * f_pos value back to the file because a concurrent read(),
57 * write() or lseek() might have altered it
58 */
59 if (offset == 0)
60 return file->f_pos;
53 offset += file->f_pos; 61 offset += file->f_pos;
54 break; 62 break;
55 } 63 }
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
105 offset += i_size_read(file->f_path.dentry->d_inode); 113 offset += i_size_read(file->f_path.dentry->d_inode);
106 break; 114 break;
107 case SEEK_CUR: 115 case SEEK_CUR:
116 if (offset == 0) {
117 retval = file->f_pos;
118 goto out;
119 }
108 offset += file->f_pos; 120 offset += file->f_pos;
109 } 121 }
110 retval = -EINVAL; 122 retval = -EINVAL;
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
115 } 127 }
116 retval = offset; 128 retval = offset;
117 } 129 }
130out:
118 unlock_kernel(); 131 unlock_kernel();
119 return retval; 132 return retval;
120} 133}
@@ -134,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
134} 147}
135EXPORT_SYMBOL(vfs_llseek); 148EXPORT_SYMBOL(vfs_llseek);
136 149
137asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin) 150SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
138{ 151{
139 off_t retval; 152 off_t retval;
140 struct file * file; 153 struct file * file;
@@ -158,9 +171,9 @@ bad:
158} 171}
159 172
160#ifdef __ARCH_WANT_SYS_LLSEEK 173#ifdef __ARCH_WANT_SYS_LLSEEK
161asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high, 174SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
162 unsigned long offset_low, loff_t __user * result, 175 unsigned long, offset_low, loff_t __user *, result,
163 unsigned int origin) 176 unsigned int, origin)
164{ 177{
165 int retval; 178 int retval;
166 struct file * file; 179 struct file * file;
@@ -356,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
356 file->f_pos = pos; 369 file->f_pos = pos;
357} 370}
358 371
359asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) 372SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
360{ 373{
361 struct file *file; 374 struct file *file;
362 ssize_t ret = -EBADF; 375 ssize_t ret = -EBADF;
@@ -373,7 +386,8 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
373 return ret; 386 return ret;
374} 387}
375 388
376asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count) 389SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
390 size_t, count)
377{ 391{
378 struct file *file; 392 struct file *file;
379 ssize_t ret = -EBADF; 393 ssize_t ret = -EBADF;
@@ -390,8 +404,8 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co
390 return ret; 404 return ret;
391} 405}
392 406
393asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, 407SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
394 size_t count, loff_t pos) 408 size_t count, loff_t pos)
395{ 409{
396 struct file *file; 410 struct file *file;
397 ssize_t ret = -EBADF; 411 ssize_t ret = -EBADF;
@@ -410,9 +424,17 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
410 424
411 return ret; 425 return ret;
412} 426}
427#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
428asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
429{
430 return SYSC_pread64((unsigned int) fd, (char __user *) buf,
431 (size_t) count, pos);
432}
433SYSCALL_ALIAS(sys_pread64, SyS_pread64);
434#endif
413 435
414asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, 436SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
415 size_t count, loff_t pos) 437 size_t count, loff_t pos)
416{ 438{
417 struct file *file; 439 struct file *file;
418 ssize_t ret = -EBADF; 440 ssize_t ret = -EBADF;
@@ -431,6 +453,14 @@ asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf,
431 453
432 return ret; 454 return ret;
433} 455}
456#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
457asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
458{
459 return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
460 (size_t) count, pos);
461}
462SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
463#endif
434 464
435/* 465/*
436 * Reduce an iovec's length in-place. Return the resulting number of segments 466 * Reduce an iovec's length in-place. Return the resulting number of segments
@@ -659,8 +689,8 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
659 689
660EXPORT_SYMBOL(vfs_writev); 690EXPORT_SYMBOL(vfs_writev);
661 691
662asmlinkage ssize_t 692SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
663sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) 693 unsigned long, vlen)
664{ 694{
665 struct file *file; 695 struct file *file;
666 ssize_t ret = -EBADF; 696 ssize_t ret = -EBADF;
@@ -680,8 +710,8 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
680 return ret; 710 return ret;
681} 711}
682 712
683asmlinkage ssize_t 713SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
684sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) 714 unsigned long, vlen)
685{ 715{
686 struct file *file; 716 struct file *file;
687 ssize_t ret = -EBADF; 717 ssize_t ret = -EBADF;
@@ -799,7 +829,7 @@ out:
799 return retval; 829 return retval;
800} 830}
801 831
802asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count) 832SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
803{ 833{
804 loff_t pos; 834 loff_t pos;
805 off_t off; 835 off_t off;
@@ -818,7 +848,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, siz
818 return do_sendfile(out_fd, in_fd, NULL, count, 0); 848 return do_sendfile(out_fd, in_fd, NULL, count, 0);
819} 849}
820 850
821asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count) 851SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
822{ 852{
823 loff_t pos; 853 loff_t pos;
824 ssize_t ret; 854 ssize_t ret;
diff --git a/fs/readdir.c b/fs/readdir.c
index b318d9b5af2e..7723401f8d8b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -102,7 +102,8 @@ efault:
102 return -EFAULT; 102 return -EFAULT;
103} 103}
104 104
105asmlinkage long old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count) 105SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
106 struct old_linux_dirent __user *, dirent, unsigned int, count)
106{ 107{
107 int error; 108 int error;
108 struct file * file; 109 struct file * file;
@@ -187,7 +188,8 @@ efault:
187 return -EFAULT; 188 return -EFAULT;
188} 189}
189 190
190asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * dirent, unsigned int count) 191SYSCALL_DEFINE3(getdents, unsigned int, fd,
192 struct linux_dirent __user *, dirent, unsigned int, count)
191{ 193{
192 struct file * file; 194 struct file * file;
193 struct linux_dirent __user * lastdirent; 195 struct linux_dirent __user * lastdirent;
@@ -268,7 +270,8 @@ efault:
268 return -EFAULT; 270 return -EFAULT;
269} 271}
270 272
271asmlinkage long sys_getdents64(unsigned int fd, struct linux_dirent64 __user * dirent, unsigned int count) 273SYSCALL_DEFINE3(getdents64, unsigned int, fd,
274 struct linux_dirent64 __user *, dirent, unsigned int, count)
272{ 275{
273 struct file * file; 276 struct file * file;
274 struct linux_dirent64 __user * lastdirent; 277 struct linux_dirent64 __user * lastdirent;
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
new file mode 100644
index 000000000000..949b8c6addc8
--- /dev/null
+++ b/fs/reiserfs/Kconfig
@@ -0,0 +1,85 @@
1config REISERFS_FS
2 tristate "Reiserfs support"
3 help
4 Stores not just filenames but the files themselves in a balanced
5 tree. Uses journalling.
6
7 Balanced trees are more efficient than traditional file system
8 architectural foundations.
9
10 In general, ReiserFS is as fast as ext2, but is very efficient with
11 large directories and small files. Additional patches are needed
12 for NFS and quotas, please see <http://www.namesys.com/> for links.
13
14 It is more easily extended to have features currently found in
15 database and keyword search systems than block allocation based file
16 systems are. The next version will be so extended, and will support
17 plugins consistent with our motto ``It takes more than a license to
18 make source code open.''
19
20 Read <http://www.namesys.com/> to learn more about reiserfs.
21
22 Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
23
24 If you like it, you can pay us to add new features to it that you
25 need, buy a support contract, or pay us to port it to another OS.
26
27config REISERFS_CHECK
28 bool "Enable reiserfs debug mode"
29 depends on REISERFS_FS
30 help
31 If you set this to Y, then ReiserFS will perform every check it can
32 possibly imagine of its internal consistency throughout its
33 operation. It will also go substantially slower. More than once we
34 have forgotten that this was on, and then gone despondent over the
35 latest benchmarks.:-) Use of this option allows our team to go all
36 out in checking for consistency when debugging without fear of its
37 effect on end users. If you are on the verge of sending in a bug
38 report, say Y and you might get a useful error message. Almost
39 everyone should say N.
40
41config REISERFS_PROC_INFO
42 bool "Stats in /proc/fs/reiserfs"
43 depends on REISERFS_FS && PROC_FS
44 help
45 Create under /proc/fs/reiserfs a hierarchy of files, displaying
46 various ReiserFS statistics and internal data at the expense of
47 making your kernel or module slightly larger (+8 KB). This also
48 increases the amount of kernel memory required for each mount.
49 Almost everyone but ReiserFS developers and people fine-tuning
50 reiserfs or tracing problems should say N.
51
52config REISERFS_FS_XATTR
53 bool "ReiserFS extended attributes"
54 depends on REISERFS_FS
55 help
56 Extended attributes are name:value pairs associated with inodes by
57 the kernel or by users (see the attr(5) manual page, or visit
58 <http://acl.bestbits.at/> for details).
59
60 If unsure, say N.
61
62config REISERFS_FS_POSIX_ACL
63 bool "ReiserFS POSIX Access Control Lists"
64 depends on REISERFS_FS_XATTR
65 select FS_POSIX_ACL
66 help
67 Posix Access Control Lists (ACLs) support permissions for users and
68 groups beyond the owner/group/world scheme.
69
70 To learn more about Access Control Lists, visit the Posix ACLs for
71 Linux website <http://acl.bestbits.at/>.
72
73 If you don't know what Access Control Lists are, say N
74
75config REISERFS_FS_SECURITY
76 bool "ReiserFS Security Labels"
77 depends on REISERFS_FS_XATTR
78 help
79 Security labels support alternative access control models
80 implemented by security modules like SELinux. This option
81 enables an extended attribute handler for file security
82 labels in the ReiserFS filesystem.
83
84 If you are not using a security module that requires using
85 extended attributes for file security labels, say N.
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6c4c2c69449f..55fce92cdf18 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1753 struct inode *inode) 1753 struct inode *inode)
1754{ 1754{
1755 struct super_block *sb; 1755 struct super_block *sb;
1756 struct reiserfs_iget_args args;
1756 INITIALIZE_PATH(path_to_key); 1757 INITIALIZE_PATH(path_to_key);
1757 struct cpu_key key; 1758 struct cpu_key key;
1758 struct item_head ih; 1759 struct item_head ih;
@@ -1780,6 +1781,20 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1780 err = -ENOMEM; 1781 err = -ENOMEM;
1781 goto out_bad_inode; 1782 goto out_bad_inode;
1782 } 1783 }
1784 args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1785 if (old_format_only(sb))
1786 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1787 TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1788 else
1789 make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1790 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1791 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1792 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
1793 if (insert_inode_locked4(inode, args.objectid,
1794 reiserfs_find_actor, &args) < 0) {
1795 err = -EINVAL;
1796 goto out_bad_inode;
1797 }
1783 if (old_format_only(sb)) 1798 if (old_format_only(sb))
1784 /* not a perfect generation count, as object ids can be reused, but 1799 /* not a perfect generation count, as object ids can be reused, but
1785 ** this is as good as reiserfs can do right now. 1800 ** this is as good as reiserfs can do right now.
@@ -1825,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1825 reiserfs_init_acl_default(inode); 1840 reiserfs_init_acl_default(inode);
1826 reiserfs_init_xattr_rwsem(inode); 1841 reiserfs_init_xattr_rwsem(inode);
1827 1842
1828 if (old_format_only(sb))
1829 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1830 TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1831 else
1832 make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1833 TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1834
1835 /* key to search for correct place for new stat data */ 1843 /* key to search for correct place for new stat data */
1836 _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), 1844 _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
1837 le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, 1845 le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
@@ -1859,13 +1867,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1859 } else { 1867 } else {
1860 inode2sd(&sd, inode, inode->i_size); 1868 inode2sd(&sd, inode, inode->i_size);
1861 } 1869 }
1862 // these do not go to on-disk stat data
1863 inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1864
1865 // store in in-core inode the key of stat data and version all 1870 // store in in-core inode the key of stat data and version all
1866 // object items will have (directory items will have old offset 1871 // object items will have (directory items will have old offset
1867 // format, other new objects will consist of new items) 1872 // format, other new objects will consist of new items)
1868 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1869 if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) 1873 if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
1870 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1874 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1871 else 1875 else
@@ -1929,7 +1933,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1929 reiserfs_mark_inode_private(inode); 1933 reiserfs_mark_inode_private(inode);
1930 } 1934 }
1931 1935
1932 insert_inode_hash(inode);
1933 reiserfs_update_sd(th, inode); 1936 reiserfs_update_sd(th, inode);
1934 reiserfs_check_path(&path_to_key); 1937 reiserfs_check_path(&path_to_key);
1935 1938
@@ -1956,6 +1959,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1956 out_inserted_sd: 1959 out_inserted_sd:
1957 inode->i_nlink = 0; 1960 inode->i_nlink = 0;
1958 th->t_trans_id = 0; /* so the caller can't use this handle later */ 1961 th->t_trans_id = 0; /* so the caller can't use this handle later */
1962 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
1959 1963
1960 /* If we were inheriting an ACL, we need to release the lock so that 1964 /* If we were inheriting an ACL, we need to release the lock so that
1961 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking 1965 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
@@ -2556,7 +2560,7 @@ static int reiserfs_write_begin(struct file *file,
2556 } 2560 }
2557 2561
2558 index = pos >> PAGE_CACHE_SHIFT; 2562 index = pos >> PAGE_CACHE_SHIFT;
2559 page = __grab_cache_page(mapping, index); 2563 page = grab_cache_page_write_begin(mapping, index, flags);
2560 if (!page) 2564 if (!page)
2561 return -ENOMEM; 2565 return -ENOMEM;
2562 *pagep = page; 2566 *pagep = page;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 4f322e5ed840..738967f6c8ee 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
646 err = journal_end(&th, dir->i_sb, jbegin_count); 646 err = journal_end(&th, dir->i_sb, jbegin_count);
647 if (err) 647 if (err)
648 retval = err; 648 retval = err;
649 unlock_new_inode(inode);
649 iput(inode); 650 iput(inode);
650 goto out_failed; 651 goto out_failed;
651 } 652 }
@@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
653 reiserfs_update_inode_transaction(dir); 654 reiserfs_update_inode_transaction(dir);
654 655
655 d_instantiate(dentry, inode); 656 d_instantiate(dentry, inode);
657 unlock_new_inode(inode);
656 retval = journal_end(&th, dir->i_sb, jbegin_count); 658 retval = journal_end(&th, dir->i_sb, jbegin_count);
657 659
658 out_failed: 660 out_failed:
@@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
727 err = journal_end(&th, dir->i_sb, jbegin_count); 729 err = journal_end(&th, dir->i_sb, jbegin_count);
728 if (err) 730 if (err)
729 retval = err; 731 retval = err;
732 unlock_new_inode(inode);
730 iput(inode); 733 iput(inode);
731 goto out_failed; 734 goto out_failed;
732 } 735 }
733 736
734 d_instantiate(dentry, inode); 737 d_instantiate(dentry, inode);
738 unlock_new_inode(inode);
735 retval = journal_end(&th, dir->i_sb, jbegin_count); 739 retval = journal_end(&th, dir->i_sb, jbegin_count);
736 740
737 out_failed: 741 out_failed:
@@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
812 err = journal_end(&th, dir->i_sb, jbegin_count); 816 err = journal_end(&th, dir->i_sb, jbegin_count);
813 if (err) 817 if (err)
814 retval = err; 818 retval = err;
819 unlock_new_inode(inode);
815 iput(inode); 820 iput(inode);
816 goto out_failed; 821 goto out_failed;
817 } 822 }
@@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
819 reiserfs_update_sd(&th, dir); 824 reiserfs_update_sd(&th, dir);
820 825
821 d_instantiate(dentry, inode); 826 d_instantiate(dentry, inode);
827 unlock_new_inode(inode);
822 retval = journal_end(&th, dir->i_sb, jbegin_count); 828 retval = journal_end(&th, dir->i_sb, jbegin_count);
823 out_failed: 829 out_failed:
824 if (locked) 830 if (locked)
@@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
1096 err = journal_end(&th, parent_dir->i_sb, jbegin_count); 1102 err = journal_end(&th, parent_dir->i_sb, jbegin_count);
1097 if (err) 1103 if (err)
1098 retval = err; 1104 retval = err;
1105 unlock_new_inode(inode);
1099 iput(inode); 1106 iput(inode);
1100 goto out_failed; 1107 goto out_failed;
1101 } 1108 }
1102 1109
1103 d_instantiate(dentry, inode); 1110 d_instantiate(dentry, inode);
1111 unlock_new_inode(inode);
1104 retval = journal_end(&th, parent_dir->i_sb, jbegin_count); 1112 retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
1105 out_failed: 1113 out_failed:
1106 reiserfs_write_unlock(parent_dir->i_sb); 1114 reiserfs_write_unlock(parent_dir->i_sb);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce8..f3c820b75829 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s)
83 reiserfs_sync_fs(s, 1); 83 reiserfs_sync_fs(s, 1);
84} 84}
85 85
86static void reiserfs_write_super_lockfs(struct super_block *s) 86static int reiserfs_freeze(struct super_block *s)
87{ 87{
88 struct reiserfs_transaction_handle th; 88 struct reiserfs_transaction_handle th;
89 reiserfs_write_lock(s); 89 reiserfs_write_lock(s);
@@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s)
101 } 101 }
102 s->s_dirt = 0; 102 s->s_dirt = 0;
103 reiserfs_write_unlock(s); 103 reiserfs_write_unlock(s);
104 return 0;
104} 105}
105 106
106static void reiserfs_unlockfs(struct super_block *s) 107static int reiserfs_unfreeze(struct super_block *s)
107{ 108{
108 reiserfs_allow_writes(s); 109 reiserfs_allow_writes(s);
110 return 0;
109} 111}
110 112
111extern const struct in_core_key MAX_IN_CORE_KEY; 113extern const struct in_core_key MAX_IN_CORE_KEY;
@@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = {
613 .put_super = reiserfs_put_super, 615 .put_super = reiserfs_put_super,
614 .write_super = reiserfs_write_super, 616 .write_super = reiserfs_write_super,
615 .sync_fs = reiserfs_sync_fs, 617 .sync_fs = reiserfs_sync_fs,
616 .write_super_lockfs = reiserfs_write_super_lockfs, 618 .freeze_fs = reiserfs_freeze,
617 .unlockfs = reiserfs_unlockfs, 619 .unfreeze_fs = reiserfs_unfreeze,
618 .statfs = reiserfs_statfs, 620 .statfs = reiserfs_statfs,
619 .remount_fs = reiserfs_remount, 621 .remount_fs = reiserfs_remount,
620 .show_options = generic_show_options, 622 .show_options = generic_show_options,
@@ -649,6 +651,8 @@ static struct dquot_operations reiserfs_quota_operations = {
649 .release_dquot = reiserfs_release_dquot, 651 .release_dquot = reiserfs_release_dquot,
650 .mark_dirty = reiserfs_mark_dquot_dirty, 652 .mark_dirty = reiserfs_mark_dquot_dirty,
651 .write_info = reiserfs_write_info, 653 .write_info = reiserfs_write_info,
654 .alloc_dquot = dquot_alloc,
655 .destroy_dquot = dquot_destroy,
652}; 656};
653 657
654static struct quotactl_ops reiserfs_qctl_operations = { 658static struct quotactl_ops reiserfs_qctl_operations = {
@@ -994,8 +998,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
994 if (c == 'u' || c == 'g') { 998 if (c == 'u' || c == 'g') {
995 int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; 999 int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
996 1000
997 if ((sb_any_quota_enabled(s) || 1001 if (sb_any_quota_loaded(s) &&
998 sb_any_quota_suspended(s)) &&
999 (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { 1002 (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
1000 reiserfs_warning(s, 1003 reiserfs_warning(s,
1001 "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); 1004 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1044,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
1041 "reiserfs_parse_options: unknown quota format specified."); 1044 "reiserfs_parse_options: unknown quota format specified.");
1042 return 0; 1045 return 0;
1043 } 1046 }
1044 if ((sb_any_quota_enabled(s) || 1047 if (sb_any_quota_loaded(s) &&
1045 sb_any_quota_suspended(s)) &&
1046 *qfmt != REISERFS_SB(s)->s_jquota_fmt) { 1048 *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
1047 reiserfs_warning(s, 1049 reiserfs_warning(s,
1048 "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); 1050 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1069,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
1067 } 1069 }
1068 /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ 1070 /* This checking is not precise wrt the quota type but for our purposes it is sufficient */
1069 if (!(*mount_options & (1 << REISERFS_QUOTA)) 1071 if (!(*mount_options & (1 << REISERFS_QUOTA))
1070 && sb_any_quota_enabled(s)) { 1072 && sb_any_quota_loaded(s)) {
1071 reiserfs_warning(s, 1073 reiserfs_warning(s,
1072 "reiserfs_parse_options: quota options must be present when quota is turned on."); 1074 "reiserfs_parse_options: quota options must be present when quota is turned on.");
1073 return 0; 1075 return 0;
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
new file mode 100644
index 000000000000..1a17020f9faf
--- /dev/null
+++ b/fs/romfs/Kconfig
@@ -0,0 +1,16 @@
1config ROMFS_FS
2 tristate "ROM file system support"
3 depends on BLOCK
4 ---help---
5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for
7 other read-only media as well. Read
8 <file:Documentation/filesystems/romfs.txt> for details.
9
10 To compile this file system support as a module, choose M here: the
11 module will be called romfs. Note that the file system of your
12 root partition (the one containing the directory /) cannot be a
13 module.
14
15 If you don't know whether you need it, then you don't need it:
16 answer N.
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 60d2f822e87b..98a232f7196b 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
490static struct inode * 490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino) 491romfs_iget(struct super_block *sb, unsigned long ino)
492{ 492{
493 int nextfh; 493 int nextfh, ret;
494 struct romfs_inode ri; 494 struct romfs_inode ri;
495 struct inode *i; 495 struct inode *i;
496 496
@@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino)
524 i->i_size = be32_to_cpu(ri.size); 524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; 525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; 526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527 i->i_uid = i->i_gid = 0;
528 527
529 /* Precalculate the data offset */ 528 /* Precalculate the data offset */
530 ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN); 529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
531 if (ino >= 0) 530 if (ret >= 0)
532 ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK); 531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
533 else 532 else
534 ino = 0; 533 ino = 0;
535 534
536 ROMFS_I(i)->i_metasize = ino; 535 ROMFS_I(i)->i_metasize = ino;
537 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); 536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf2..0fe0e1469df3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
109void poll_initwait(struct poll_wqueues *pwq) 109void poll_initwait(struct poll_wqueues *pwq)
110{ 110{
111 init_poll_funcptr(&pwq->pt, __pollwait); 111 init_poll_funcptr(&pwq->pt, __pollwait);
112 pwq->polling_task = current;
112 pwq->error = 0; 113 pwq->error = 0;
113 pwq->table = NULL; 114 pwq->table = NULL;
114 pwq->inline_index = 0; 115 pwq->inline_index = 0;
115} 116}
116
117EXPORT_SYMBOL(poll_initwait); 117EXPORT_SYMBOL(poll_initwait);
118 118
119static void free_poll_entry(struct poll_table_entry *entry) 119static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
142 free_page((unsigned long) old); 142 free_page((unsigned long) old);
143 } 143 }
144} 144}
145
146EXPORT_SYMBOL(poll_freewait); 145EXPORT_SYMBOL(poll_freewait);
147 146
148static struct poll_table_entry *poll_get_entry(poll_table *_p) 147static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
149{ 148{
150 struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
151 struct poll_table_page *table = p->table; 149 struct poll_table_page *table = p->table;
152 150
153 if (p->inline_index < N_INLINE_POLL_ENTRIES) 151 if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
159 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); 157 new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
160 if (!new_table) { 158 if (!new_table) {
161 p->error = -ENOMEM; 159 p->error = -ENOMEM;
162 __set_current_state(TASK_RUNNING);
163 return NULL; 160 return NULL;
164 } 161 }
165 new_table->entry = new_table->entries; 162 new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
171 return table->entry++; 168 return table->entry++;
172} 169}
173 170
171static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
172{
173 struct poll_wqueues *pwq = wait->private;
174 DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
175
176 /*
177 * Although this function is called under waitqueue lock, LOCK
178 * doesn't imply write barrier and the users expect write
179 * barrier semantics on wakeup functions. The following
180 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
181 * and is paired with set_mb() in poll_schedule_timeout.
182 */
183 smp_wmb();
184 pwq->triggered = 1;
185
186 /*
187 * Perform the default wake up operation using a dummy
188 * waitqueue.
189 *
190 * TODO: This is hacky but there currently is no interface to
191 * pass in @sync. @sync is scheduled to be removed and once
192 * that happens, wake_up_process() can be used directly.
193 */
194 return default_wake_function(&dummy_wait, mode, sync, key);
195}
196
174/* Add a new entry */ 197/* Add a new entry */
175static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, 198static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
176 poll_table *p) 199 poll_table *p)
177{ 200{
178 struct poll_table_entry *entry = poll_get_entry(p); 201 struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
202 struct poll_table_entry *entry = poll_get_entry(pwq);
179 if (!entry) 203 if (!entry)
180 return; 204 return;
181 get_file(filp); 205 get_file(filp);
182 entry->filp = filp; 206 entry->filp = filp;
183 entry->wait_address = wait_address; 207 entry->wait_address = wait_address;
184 init_waitqueue_entry(&entry->wait, current); 208 init_waitqueue_func_entry(&entry->wait, pollwake);
209 entry->wait.private = pwq;
185 add_wait_queue(wait_address, &entry->wait); 210 add_wait_queue(wait_address, &entry->wait);
186} 211}
187 212
213int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
214 ktime_t *expires, unsigned long slack)
215{
216 int rc = -EINTR;
217
218 set_current_state(state);
219 if (!pwq->triggered)
220 rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
221 __set_current_state(TASK_RUNNING);
222
223 /*
224 * Prepare for the next iteration.
225 *
226 * The following set_mb() serves two purposes. First, it's
227 * the counterpart rmb of the wmb in pollwake() such that data
228 * written before wake up is always visible after wake up.
229 * Second, the full barrier guarantees that triggered clearing
230 * doesn't pass event check of the next iteration. Note that
231 * this problem doesn't exist for the first iteration as
232 * add_wait_queue() has full barrier semantics.
233 */
234 set_mb(pwq->triggered, 0);
235
236 return rc;
237}
238EXPORT_SYMBOL(poll_schedule_timeout);
239
188/** 240/**
189 * poll_select_set_timeout - helper function to setup the timeout value 241 * poll_select_set_timeout - helper function to setup the timeout value
190 * @to: pointer to timespec variable for the final timeout 242 * @to: pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
340 for (;;) { 392 for (;;) {
341 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; 393 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
342 394
343 set_current_state(TASK_INTERRUPTIBLE);
344
345 inp = fds->in; outp = fds->out; exp = fds->ex; 395 inp = fds->in; outp = fds->out; exp = fds->ex;
346 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; 396 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
347 397
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
411 to = &expire; 461 to = &expire;
412 } 462 }
413 463
414 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) 464 if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
465 to, slack))
415 timed_out = 1; 466 timed_out = 1;
416 } 467 }
417 __set_current_state(TASK_RUNNING);
418 468
419 poll_freewait(&table); 469 poll_freewait(&table);
420 470
@@ -507,8 +557,8 @@ out_nofds:
507 return ret; 557 return ret;
508} 558}
509 559
510asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, 560SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
511 fd_set __user *exp, struct timeval __user *tvp) 561 fd_set __user *, exp, struct timeval __user *, tvp)
512{ 562{
513 struct timespec end_time, *to = NULL; 563 struct timespec end_time, *to = NULL;
514 struct timeval tv; 564 struct timeval tv;
@@ -532,9 +582,9 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
532} 582}
533 583
534#ifdef HAVE_SET_RESTORE_SIGMASK 584#ifdef HAVE_SET_RESTORE_SIGMASK
535asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, 585static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
536 fd_set __user *exp, struct timespec __user *tsp, 586 fd_set __user *exp, struct timespec __user *tsp,
537 const sigset_t __user *sigmask, size_t sigsetsize) 587 const sigset_t __user *sigmask, size_t sigsetsize)
538{ 588{
539 sigset_t ksigmask, sigsaved; 589 sigset_t ksigmask, sigsaved;
540 struct timespec ts, end_time, *to = NULL; 590 struct timespec ts, end_time, *to = NULL;
@@ -560,7 +610,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
560 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); 610 sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
561 } 611 }
562 612
563 ret = core_sys_select(n, inp, outp, exp, &end_time); 613 ret = core_sys_select(n, inp, outp, exp, to);
564 ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); 614 ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
565 615
566 if (ret == -ERESTARTNOHAND) { 616 if (ret == -ERESTARTNOHAND) {
@@ -586,8 +636,9 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
586 * which has a pointer to the sigset_t itself followed by a size_t containing 636 * which has a pointer to the sigset_t itself followed by a size_t containing
587 * the sigset size. 637 * the sigset size.
588 */ 638 */
589asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, 639SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
590 fd_set __user *exp, struct timespec __user *tsp, void __user *sig) 640 fd_set __user *, exp, struct timespec __user *, tsp,
641 void __user *, sig)
591{ 642{
592 size_t sigsetsize = 0; 643 size_t sigsetsize = 0;
593 sigset_t __user *up = NULL; 644 sigset_t __user *up = NULL;
@@ -600,7 +651,7 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
600 return -EFAULT; 651 return -EFAULT;
601 } 652 }
602 653
603 return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize); 654 return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
604} 655}
605#endif /* HAVE_SET_RESTORE_SIGMASK */ 656#endif /* HAVE_SET_RESTORE_SIGMASK */
606 657
@@ -666,7 +717,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
666 for (;;) { 717 for (;;) {
667 struct poll_list *walk; 718 struct poll_list *walk;
668 719
669 set_current_state(TASK_INTERRUPTIBLE);
670 for (walk = list; walk != NULL; walk = walk->next) { 720 for (walk = list; walk != NULL; walk = walk->next) {
671 struct pollfd * pfd, * pfd_end; 721 struct pollfd * pfd, * pfd_end;
672 722
@@ -709,10 +759,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
709 to = &expire; 759 to = &expire;
710 } 760 }
711 761
712 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) 762 if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
713 timed_out = 1; 763 timed_out = 1;
714 } 764 }
715 __set_current_state(TASK_RUNNING);
716 return count; 765 return count;
717} 766}
718 767
@@ -806,8 +855,8 @@ static long do_restart_poll(struct restart_block *restart_block)
806 return ret; 855 return ret;
807} 856}
808 857
809asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, 858SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
810 long timeout_msecs) 859 long, timeout_msecs)
811{ 860{
812 struct timespec end_time, *to = NULL; 861 struct timespec end_time, *to = NULL;
813 int ret; 862 int ret;
@@ -841,9 +890,9 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
841} 890}
842 891
843#ifdef HAVE_SET_RESTORE_SIGMASK 892#ifdef HAVE_SET_RESTORE_SIGMASK
844asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, 893SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
845 struct timespec __user *tsp, const sigset_t __user *sigmask, 894 struct timespec __user *, tsp, const sigset_t __user *, sigmask,
846 size_t sigsetsize) 895 size_t, sigsetsize)
847{ 896{
848 sigset_t ksigmask, sigsaved; 897 sigset_t ksigmask, sigsaved;
849 struct timespec ts, end_time, *to = NULL; 898 struct timespec ts, end_time, *to = NULL;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 16c211558c22..5267098532bf 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -54,6 +54,64 @@ int seq_open(struct file *file, const struct seq_operations *op)
54} 54}
55EXPORT_SYMBOL(seq_open); 55EXPORT_SYMBOL(seq_open);
56 56
57static int traverse(struct seq_file *m, loff_t offset)
58{
59 loff_t pos = 0, index;
60 int error = 0;
61 void *p;
62
63 m->version = 0;
64 index = 0;
65 m->count = m->from = 0;
66 if (!offset) {
67 m->index = index;
68 return 0;
69 }
70 if (!m->buf) {
71 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
72 if (!m->buf)
73 return -ENOMEM;
74 }
75 p = m->op->start(m, &index);
76 while (p) {
77 error = PTR_ERR(p);
78 if (IS_ERR(p))
79 break;
80 error = m->op->show(m, p);
81 if (error < 0)
82 break;
83 if (unlikely(error)) {
84 error = 0;
85 m->count = 0;
86 }
87 if (m->count == m->size)
88 goto Eoverflow;
89 if (pos + m->count > offset) {
90 m->from = offset - pos;
91 m->count -= m->from;
92 m->index = index;
93 break;
94 }
95 pos += m->count;
96 m->count = 0;
97 if (pos == offset) {
98 index++;
99 m->index = index;
100 break;
101 }
102 p = m->op->next(m, p, &index);
103 }
104 m->op->stop(m, p);
105 m->index = index;
106 return error;
107
108Eoverflow:
109 m->op->stop(m, p);
110 kfree(m->buf);
111 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
112 return !m->buf ? -ENOMEM : -EAGAIN;
113}
114
57/** 115/**
58 * seq_read - ->read() method for sequential files. 116 * seq_read - ->read() method for sequential files.
59 * @file: the file to read from 117 * @file: the file to read from
@@ -186,63 +244,6 @@ Efault:
186} 244}
187EXPORT_SYMBOL(seq_read); 245EXPORT_SYMBOL(seq_read);
188 246
189static int traverse(struct seq_file *m, loff_t offset)
190{
191 loff_t pos = 0, index;
192 int error = 0;
193 void *p;
194
195 m->version = 0;
196 index = 0;
197 m->count = m->from = 0;
198 if (!offset) {
199 m->index = index;
200 return 0;
201 }
202 if (!m->buf) {
203 m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
204 if (!m->buf)
205 return -ENOMEM;
206 }
207 p = m->op->start(m, &index);
208 while (p) {
209 error = PTR_ERR(p);
210 if (IS_ERR(p))
211 break;
212 error = m->op->show(m, p);
213 if (error < 0)
214 break;
215 if (unlikely(error)) {
216 error = 0;
217 m->count = 0;
218 }
219 if (m->count == m->size)
220 goto Eoverflow;
221 if (pos + m->count > offset) {
222 m->from = offset - pos;
223 m->count -= m->from;
224 m->index = index;
225 break;
226 }
227 pos += m->count;
228 m->count = 0;
229 if (pos == offset) {
230 index++;
231 m->index = index;
232 break;
233 }
234 p = m->op->next(m, p, &index);
235 }
236 m->op->stop(m, p);
237 return error;
238
239Eoverflow:
240 m->op->stop(m, p);
241 kfree(m->buf);
242 m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
243 return !m->buf ? -ENOMEM : -EAGAIN;
244}
245
246/** 247/**
247 * seq_lseek - ->llseek() method for sequential files. 248 * seq_lseek - ->llseek() method for sequential files.
248 * @file: the file in question 249 * @file: the file in question
@@ -389,8 +390,14 @@ char *mangle_path(char *s, char *p, char *esc)
389} 390}
390EXPORT_SYMBOL(mangle_path); 391EXPORT_SYMBOL(mangle_path);
391 392
392/* 393/**
393 * return the absolute path of 'dentry' residing in mount 'mnt'. 394 * seq_path - seq_file interface to print a pathname
395 * @m: the seq_file handle
396 * @path: the struct path to print
397 * @esc: set of characters to escape in the output
398 *
399 * return the absolute path of 'path', as represented by the
400 * dentry / mnt pair in the path parameter.
394 */ 401 */
395int seq_path(struct seq_file *m, struct path *path, char *esc) 402int seq_path(struct seq_file *m, struct path *path, char *esc)
396{ 403{
@@ -462,7 +469,8 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
462 return -1; 469 return -1;
463} 470}
464 471
465int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits) 472int seq_bitmap(struct seq_file *m, const unsigned long *bits,
473 unsigned int nr_bits)
466{ 474{
467 if (m->count < m->size) { 475 if (m->count < m->size) {
468 int len = bitmap_scnprintf(m->buf + m->count, 476 int len = bitmap_scnprintf(m->buf + m->count,
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 9c39bc7f8431..b07565c94386 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -205,8 +205,8 @@ static const struct file_operations signalfd_fops = {
205 .read = signalfd_read, 205 .read = signalfd_read,
206}; 206};
207 207
208asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, 208SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
209 size_t sizemask, int flags) 209 size_t, sizemask, int, flags)
210{ 210{
211 sigset_t sigmask; 211 sigset_t sigmask;
212 struct signalfd_ctx *ctx; 212 struct signalfd_ctx *ctx;
@@ -259,8 +259,8 @@ asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
259 return ufd; 259 return ufd;
260} 260}
261 261
262asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, 262SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
263 size_t sizemask) 263 size_t, sizemask)
264{ 264{
265 return sys_signalfd4(ufd, user_mask, sizemask, 0); 265 return sys_signalfd4(ufd, user_mask, sizemask, 0);
266} 266}
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
new file mode 100644
index 000000000000..e668127c8b2e
--- /dev/null
+++ b/fs/smbfs/Kconfig
@@ -0,0 +1,55 @@
1config SMB_FS
2 tristate "SMB file system support (OBSOLETE, please use CIFS)"
3 depends on INET
4 select NLS
5 help
6 SMB (Server Message Block) is the protocol Windows for Workgroups
7 (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
8 files and printers over local networks. Saying Y here allows you to
9 mount their file systems (often called "shares" in this context) and
10 access them just like any other Unix directory. Currently, this
11 works only if the Windows machines use TCP/IP as the underlying
12 transport protocol, and not NetBEUI. For details, read
13 <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
14 available from <http://www.tldp.org/docs.html#howto>.
15
16 Note: if you just want your box to act as an SMB *server* and make
17 files and printing services available to Windows clients (which need
18 to have a TCP/IP stack), you don't need to say Y here; you can use
19 the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
20 for that.
21
22 General information about how to connect Linux, Windows machines and
23 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
24
25 To compile the SMB support as a module, choose M here:
26 the module will be called smbfs. Most people say N, however.
27
28config SMB_NLS_DEFAULT
29 bool "Use a default NLS"
30 depends on SMB_FS
31 help
32 Enabling this will make smbfs use nls translations by default. You
33 need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
34 settings and you need to give the default nls for the SMB server as
35 CONFIG_SMB_NLS_REMOTE.
36
37 The nls settings can be changed at mount time, if your smbmount
38 supports that, using the codepage and iocharset parameters.
39
40 smbmount from samba 2.2.0 or later supports this.
41
42config SMB_NLS_REMOTE
43 string "Default Remote NLS Option"
44 depends on SMB_NLS_DEFAULT
45 default "cp437"
46 help
47 This setting allows you to specify a default value for which
48 codepage the server uses. If this field is left blank no
49 translations will be done by default. The local codepage/charset
50 defaults to CONFIG_NLS_DEFAULT.
51
52 The nls settings can be changed at mount time, if your smbmount
53 supports that, using the codepage and iocharset parameters.
54
55 smbmount from samba 2.2.0 or later supports this.
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index e4f8d51a5553..92d5e8ffb639 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping,
297 struct page **pagep, void **fsdata) 297 struct page **pagep, void **fsdata)
298{ 298{
299 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 299 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
300 *pagep = __grab_cache_page(mapping, index); 300 *pagep = grab_cache_page_write_begin(mapping, index, flags);
301 if (!*pagep) 301 if (!*pagep)
302 return -ENOMEM; 302 return -ENOMEM;
303 return 0; 303 return 0;
diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4ba..4ed0ba44a966 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/splice.h> 23#include <linux/splice.h>
24#include <linux/memcontrol.h>
24#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/writeback.h> 27#include <linux/writeback.h>
@@ -1434,8 +1435,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1434 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 1435 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1435 * 1436 *
1436 */ 1437 */
1437asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, 1438SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1438 unsigned long nr_segs, unsigned int flags) 1439 unsigned long, nr_segs, unsigned int, flags)
1439{ 1440{
1440 struct file *file; 1441 struct file *file;
1441 long error; 1442 long error;
@@ -1460,9 +1461,9 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1460 return error; 1461 return error;
1461} 1462}
1462 1463
1463asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 1464SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1464 int fd_out, loff_t __user *off_out, 1465 int, fd_out, loff_t __user *, off_out,
1465 size_t len, unsigned int flags) 1466 size_t, len, unsigned int, flags)
1466{ 1467{
1467 long error; 1468 long error;
1468 struct file *in, *out; 1469 struct file *in, *out;
@@ -1684,7 +1685,7 @@ static long do_tee(struct file *in, struct file *out, size_t len,
1684 return ret; 1685 return ret;
1685} 1686}
1686 1687
1687asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) 1688SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1688{ 1689{
1689 struct file *in; 1690 struct file *in;
1690 int error, fput_in; 1691 int error, fput_in;
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
new file mode 100644
index 000000000000..25a00d19d686
--- /dev/null
+++ b/fs/squashfs/Kconfig
@@ -0,0 +1,51 @@
1config SQUASHFS
2 tristate "SquashFS 4.0 - Squashed file system support"
3 depends on BLOCK
4 select ZLIB_INFLATE
5 help
6 Saying Y here includes support for SquashFS 4.0 (a Compressed
7 Read-Only File System). Squashfs is a highly compressed read-only
8 filesystem for Linux. It uses zlib compression to compress both
9 files, inodes and directories. Inodes in the system are very small
10 and all blocks are packed to minimise data overhead. Block sizes
11 greater than 4K are supported up to a maximum of 1 Mbytes (default
12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
13 (larger than 4GB), full uid/gid information, hard links and
14 timestamps.
15
16 Squashfs is intended for general read-only filesystem use, for
17 archival use (i.e. in cases where a .tar.gz file may be used), and in
18 embedded systems where low overhead is needed. Further information
19 and tools are available from http://squashfs.sourceforge.net.
20
21 If you want to compile this as a module ( = code which can be
22 inserted in and removed from the running kernel whenever you want),
23 say M here and read <file:Documentation/modules.txt>. The module
24 will be called squashfs. Note that the root file system (the one
25 containing the directory /) cannot be compiled as a module.
26
27 If unsure, say N.
28
29config SQUASHFS_EMBEDDED
30
31 bool "Additional option for memory-constrained systems"
32 depends on SQUASHFS
33 default n
34 help
35 Saying Y here allows you to specify cache size.
36
37 If unsure, say N.
38
39config SQUASHFS_FRAGMENT_CACHE_SIZE
40 int "Number of fragments cached" if SQUASHFS_EMBEDDED
41 depends on SQUASHFS
42 default "3"
43 help
44 By default SquashFS caches the last 3 fragments read from
45 the filesystem. Increasing this amount may mean SquashFS
46 has to re-read fragments less often from disk, at the expense
47 of extra system memory. Decreasing this amount will mean
48 SquashFS uses less memory at the expense of extra reads from disk.
49
50 Note there must be at least one cached fragment. Anything
51 much more than three will probably not make much difference.
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
new file mode 100644
index 000000000000..8258cf9a0317
--- /dev/null
+++ b/fs/squashfs/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the linux squashfs routines.
3#
4
5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o
8#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
new file mode 100644
index 000000000000..c837dfc2b3c6
--- /dev/null
+++ b/fs/squashfs/block.c
@@ -0,0 +1,274 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * block.c
22 */
23
24/*
25 * This file implements the low-level routines to read and decompress
26 * datablocks and metadata blocks.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32#include <linux/mutex.h>
33#include <linux/string.h>
34#include <linux/buffer_head.h>
35#include <linux/zlib.h>
36
37#include "squashfs_fs.h"
38#include "squashfs_fs_sb.h"
39#include "squashfs_fs_i.h"
40#include "squashfs.h"
41
42/*
43 * Read the metadata block length, this is stored in the first two
44 * bytes of the metadata block.
45 */
46static struct buffer_head *get_block_length(struct super_block *sb,
47 u64 *cur_index, int *offset, int *length)
48{
49 struct squashfs_sb_info *msblk = sb->s_fs_info;
50 struct buffer_head *bh;
51
52 bh = sb_bread(sb, *cur_index);
53 if (bh == NULL)
54 return NULL;
55
56 if (msblk->devblksize - *offset == 1) {
57 *length = (unsigned char) bh->b_data[*offset];
58 put_bh(bh);
59 bh = sb_bread(sb, ++(*cur_index));
60 if (bh == NULL)
61 return NULL;
62 *length |= (unsigned char) bh->b_data[0] << 8;
63 *offset = 1;
64 } else {
65 *length = (unsigned char) bh->b_data[*offset] |
66 (unsigned char) bh->b_data[*offset + 1] << 8;
67 *offset += 2;
68 }
69
70 return bh;
71}
72
73
74/*
75 * Read and decompress a metadata block or datablock. Length is non-zero
76 * if a datablock is being read (the size is stored elsewhere in the
77 * filesystem), otherwise the length is obtained from the first two bytes of
78 * the metadata block. A bit in the length field indicates if the block
79 * is stored uncompressed in the filesystem (usually because compression
80 * generated a larger block - this does occasionally happen with zlib).
81 */
82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
83 int length, u64 *next_index, int srclength)
84{
85 struct squashfs_sb_info *msblk = sb->s_fs_info;
86 struct buffer_head **bh;
87 int offset = index & ((1 << msblk->devblksize_log2) - 1);
88 u64 cur_index = index >> msblk->devblksize_log2;
89 int bytes, compressed, b = 0, k = 0, page = 0, avail;
90
91
92 bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
93 sizeof(*bh), GFP_KERNEL);
94 if (bh == NULL)
95 return -ENOMEM;
96
97 if (length) {
98 /*
99 * Datablock.
100 */
101 bytes = -offset;
102 compressed = SQUASHFS_COMPRESSED_BLOCK(length);
103 length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
104 if (next_index)
105 *next_index = index + length;
106
107 TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
108 index, compressed ? "" : "un", length, srclength);
109
110 if (length < 0 || length > srclength ||
111 (index + length) > msblk->bytes_used)
112 goto read_failure;
113
114 for (b = 0; bytes < length; b++, cur_index++) {
115 bh[b] = sb_getblk(sb, cur_index);
116 if (bh[b] == NULL)
117 goto block_release;
118 bytes += msblk->devblksize;
119 }
120 ll_rw_block(READ, b, bh);
121 } else {
122 /*
123 * Metadata block.
124 */
125 if ((index + 2) > msblk->bytes_used)
126 goto read_failure;
127
128 bh[0] = get_block_length(sb, &cur_index, &offset, &length);
129 if (bh[0] == NULL)
130 goto read_failure;
131 b = 1;
132
133 bytes = msblk->devblksize - offset;
134 compressed = SQUASHFS_COMPRESSED(length);
135 length = SQUASHFS_COMPRESSED_SIZE(length);
136 if (next_index)
137 *next_index = index + length + 2;
138
139 TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
140 compressed ? "" : "un", length);
141
142 if (length < 0 || length > srclength ||
143 (index + length) > msblk->bytes_used)
144 goto block_release;
145
146 for (; bytes < length; b++) {
147 bh[b] = sb_getblk(sb, ++cur_index);
148 if (bh[b] == NULL)
149 goto block_release;
150 bytes += msblk->devblksize;
151 }
152 ll_rw_block(READ, b - 1, bh + 1);
153 }
154
155 if (compressed) {
156 int zlib_err = 0, zlib_init = 0;
157
158 /*
159 * Uncompress block.
160 */
161
162 mutex_lock(&msblk->read_data_mutex);
163
164 msblk->stream.avail_out = 0;
165 msblk->stream.avail_in = 0;
166
167 bytes = length;
168 do {
169 if (msblk->stream.avail_in == 0 && k < b) {
170 avail = min(bytes, msblk->devblksize - offset);
171 bytes -= avail;
172 wait_on_buffer(bh[k]);
173 if (!buffer_uptodate(bh[k]))
174 goto release_mutex;
175
176 if (avail == 0) {
177 offset = 0;
178 put_bh(bh[k++]);
179 continue;
180 }
181
182 msblk->stream.next_in = bh[k]->b_data + offset;
183 msblk->stream.avail_in = avail;
184 offset = 0;
185 }
186
187 if (msblk->stream.avail_out == 0) {
188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 }
191
192 if (!zlib_init) {
193 zlib_err = zlib_inflateInit(&msblk->stream);
194 if (zlib_err != Z_OK) {
195 ERROR("zlib_inflateInit returned"
196 " unexpected result 0x%x,"
197 " srclength %d\n", zlib_err,
198 srclength);
199 goto release_mutex;
200 }
201 zlib_init = 1;
202 }
203
204 zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
205
206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK);
209
210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate returned unexpected result"
212 " 0x%x, srclength %d, avail_in %d,"
213 " avail_out %d\n", zlib_err, srclength,
214 msblk->stream.avail_in,
215 msblk->stream.avail_out);
216 goto release_mutex;
217 }
218
219 zlib_err = zlib_inflateEnd(&msblk->stream);
220 if (zlib_err != Z_OK) {
221 ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
222 " srclength %d\n", zlib_err, srclength);
223 goto release_mutex;
224 }
225 length = msblk->stream.total_out;
226 mutex_unlock(&msblk->read_data_mutex);
227 } else {
228 /*
229 * Block is uncompressed.
230 */
231 int i, in, pg_offset = 0;
232
233 for (i = 0; i < b; i++) {
234 wait_on_buffer(bh[i]);
235 if (!buffer_uptodate(bh[i]))
236 goto block_release;
237 }
238
239 for (bytes = length; k < b; k++) {
240 in = min(bytes, msblk->devblksize - offset);
241 bytes -= in;
242 while (in) {
243 if (pg_offset == PAGE_CACHE_SIZE) {
244 page++;
245 pg_offset = 0;
246 }
247 avail = min_t(int, in, PAGE_CACHE_SIZE -
248 pg_offset);
249 memcpy(buffer[page] + pg_offset,
250 bh[k]->b_data + offset, avail);
251 in -= avail;
252 pg_offset += avail;
253 offset += avail;
254 }
255 offset = 0;
256 put_bh(bh[k]);
257 }
258 }
259
260 kfree(bh);
261 return length;
262
263release_mutex:
264 mutex_unlock(&msblk->read_data_mutex);
265
266block_release:
267 for (; k < b; k++)
268 put_bh(bh[k]);
269
270read_failure:
271 ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
272 kfree(bh);
273 return -EIO;
274}
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
new file mode 100644
index 000000000000..f29eda16d25e
--- /dev/null
+++ b/fs/squashfs/cache.c
@@ -0,0 +1,412 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * cache.c
22 */
23
24/*
25 * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
26 * recently accessed data Squashfs uses two small metadata and fragment caches.
27 *
28 * This file implements a generic cache implementation used for both caches,
29 * plus functions layered on top of the generic cache implementation to
30 * access the metadata and fragment caches.
31 *
32 * To avoid out of memory and fragmentation issues with vmalloc the cache
33 * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
34 *
35 * It should be noted that the cache is not used for file datablocks, these
36 * are decompressed and cached in the page-cache in the normal way. The
37 * cache is only used to temporarily cache fragment and metadata blocks
38 * which have been read as a result of a metadata (i.e. inode or
39 * directory) or fragment access. Because metadata and fragments are packed
40 * together into blocks (to gain greater compression) the read of a particular
41 * piece of metadata or fragment will retrieve other metadata/fragments which
42 * have been packed with it, these because of locality-of-reference may be read
43 * in the near future. Temporarily caching them ensures they are available for
44 * near future access without requiring an additional read and decompress.
45 */
46
47#include <linux/fs.h>
48#include <linux/vfs.h>
49#include <linux/slab.h>
50#include <linux/vmalloc.h>
51#include <linux/sched.h>
52#include <linux/spinlock.h>
53#include <linux/wait.h>
54#include <linux/zlib.h>
55#include <linux/pagemap.h>
56
57#include "squashfs_fs.h"
58#include "squashfs_fs_sb.h"
59#include "squashfs_fs_i.h"
60#include "squashfs.h"
61
62/*
63 * Look-up block in cache, and increment usage count. If not in cache, read
64 * and decompress it from disk.
65 */
66struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
67 struct squashfs_cache *cache, u64 block, int length)
68{
69 int i, n;
70 struct squashfs_cache_entry *entry;
71
72 spin_lock(&cache->lock);
73
74 while (1) {
75 for (i = 0; i < cache->entries; i++)
76 if (cache->entry[i].block == block)
77 break;
78
79 if (i == cache->entries) {
80 /*
81 * Block not in cache, if all cache entries are used
82 * go to sleep waiting for one to become available.
83 */
84 if (cache->unused == 0) {
85 cache->num_waiters++;
86 spin_unlock(&cache->lock);
87 wait_event(cache->wait_queue, cache->unused);
88 spin_lock(&cache->lock);
89 cache->num_waiters--;
90 continue;
91 }
92
93 /*
94 * At least one unused cache entry. A simple
95 * round-robin strategy is used to choose the entry to
96 * be evicted from the cache.
97 */
98 i = cache->next_blk;
99 for (n = 0; n < cache->entries; n++) {
100 if (cache->entry[i].refcount == 0)
101 break;
102 i = (i + 1) % cache->entries;
103 }
104
105 cache->next_blk = (i + 1) % cache->entries;
106 entry = &cache->entry[i];
107
108 /*
109 * Initialise choosen cache entry, and fill it in from
110 * disk.
111 */
112 cache->unused--;
113 entry->block = block;
114 entry->refcount = 1;
115 entry->pending = 1;
116 entry->num_waiters = 0;
117 entry->error = 0;
118 spin_unlock(&cache->lock);
119
120 entry->length = squashfs_read_data(sb, entry->data,
121 block, length, &entry->next_index,
122 cache->block_size);
123
124 spin_lock(&cache->lock);
125
126 if (entry->length < 0)
127 entry->error = entry->length;
128
129 entry->pending = 0;
130
131 /*
132 * While filling this entry one or more other processes
133 * have looked it up in the cache, and have slept
134 * waiting for it to become available.
135 */
136 if (entry->num_waiters) {
137 spin_unlock(&cache->lock);
138 wake_up_all(&entry->wait_queue);
139 } else
140 spin_unlock(&cache->lock);
141
142 goto out;
143 }
144
145 /*
146 * Block already in cache. Increment refcount so it doesn't
147 * get reused until we're finished with it, if it was
148 * previously unused there's one less cache entry available
149 * for reuse.
150 */
151 entry = &cache->entry[i];
152 if (entry->refcount == 0)
153 cache->unused--;
154 entry->refcount++;
155
156 /*
157 * If the entry is currently being filled in by another process
158 * go to sleep waiting for it to become available.
159 */
160 if (entry->pending) {
161 entry->num_waiters++;
162 spin_unlock(&cache->lock);
163 wait_event(entry->wait_queue, !entry->pending);
164 } else
165 spin_unlock(&cache->lock);
166
167 goto out;
168 }
169
170out:
171 TRACE("Got %s %d, start block %lld, refcount %d, error %d\n",
172 cache->name, i, entry->block, entry->refcount, entry->error);
173
174 if (entry->error)
175 ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
176 block);
177 return entry;
178}
179
180
181/*
182 * Release cache entry, once usage count is zero it can be reused.
183 */
184void squashfs_cache_put(struct squashfs_cache_entry *entry)
185{
186 struct squashfs_cache *cache = entry->cache;
187
188 spin_lock(&cache->lock);
189 entry->refcount--;
190 if (entry->refcount == 0) {
191 cache->unused++;
192 /*
193 * If there's any processes waiting for a block to become
194 * available, wake one up.
195 */
196 if (cache->num_waiters) {
197 spin_unlock(&cache->lock);
198 wake_up(&cache->wait_queue);
199 return;
200 }
201 }
202 spin_unlock(&cache->lock);
203}
204
205/*
206 * Delete cache reclaiming all kmalloced buffers.
207 */
208void squashfs_cache_delete(struct squashfs_cache *cache)
209{
210 int i, j;
211
212 if (cache == NULL)
213 return;
214
215 for (i = 0; i < cache->entries; i++) {
216 if (cache->entry[i].data) {
217 for (j = 0; j < cache->pages; j++)
218 kfree(cache->entry[i].data[j]);
219 kfree(cache->entry[i].data);
220 }
221 }
222
223 kfree(cache->entry);
224 kfree(cache);
225}
226
227
228/*
229 * Initialise cache allocating the specified number of entries, each of
230 * size block_size. To avoid vmalloc fragmentation issues each entry
231 * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
232 */
233struct squashfs_cache *squashfs_cache_init(char *name, int entries,
234 int block_size)
235{
236 int i, j;
237 struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
238
239 if (cache == NULL) {
240 ERROR("Failed to allocate %s cache\n", name);
241 return NULL;
242 }
243
244 cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL);
245 if (cache->entry == NULL) {
246 ERROR("Failed to allocate %s cache\n", name);
247 goto cleanup;
248 }
249
250 cache->next_blk = 0;
251 cache->unused = entries;
252 cache->entries = entries;
253 cache->block_size = block_size;
254 cache->pages = block_size >> PAGE_CACHE_SHIFT;
255 cache->name = name;
256 cache->num_waiters = 0;
257 spin_lock_init(&cache->lock);
258 init_waitqueue_head(&cache->wait_queue);
259
260 for (i = 0; i < entries; i++) {
261 struct squashfs_cache_entry *entry = &cache->entry[i];
262
263 init_waitqueue_head(&cache->entry[i].wait_queue);
264 entry->cache = cache;
265 entry->block = SQUASHFS_INVALID_BLK;
266 entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
267 if (entry->data == NULL) {
268 ERROR("Failed to allocate %s cache entry\n", name);
269 goto cleanup;
270 }
271
272 for (j = 0; j < cache->pages; j++) {
273 entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
274 if (entry->data[j] == NULL) {
275 ERROR("Failed to allocate %s buffer\n", name);
276 goto cleanup;
277 }
278 }
279 }
280
281 return cache;
282
283cleanup:
284 squashfs_cache_delete(cache);
285 return NULL;
286}
287
288
289/*
290 * Copy upto length bytes from cache entry to buffer starting at offset bytes
291 * into the cache entry. If there's not length bytes then copy the number of
292 * bytes available. In all cases return the number of bytes copied.
293 */
294int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
295 int offset, int length)
296{
297 int remaining = length;
298
299 if (length == 0)
300 return 0;
301 else if (buffer == NULL)
302 return min(length, entry->length - offset);
303
304 while (offset < entry->length) {
305 void *buff = entry->data[offset / PAGE_CACHE_SIZE]
306 + (offset % PAGE_CACHE_SIZE);
307 int bytes = min_t(int, entry->length - offset,
308 PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
309
310 if (bytes >= remaining) {
311 memcpy(buffer, buff, remaining);
312 remaining = 0;
313 break;
314 }
315
316 memcpy(buffer, buff, bytes);
317 buffer += bytes;
318 remaining -= bytes;
319 offset += bytes;
320 }
321
322 return length - remaining;
323}
324
325
326/*
327 * Read length bytes from metadata position <block, offset> (block is the
328 * start of the compressed block on disk, and offset is the offset into
329 * the block once decompressed). Data is packed into consecutive blocks,
330 * and length bytes may require reading more than one block.
331 */
332int squashfs_read_metadata(struct super_block *sb, void *buffer,
333 u64 *block, int *offset, int length)
334{
335 struct squashfs_sb_info *msblk = sb->s_fs_info;
336 int bytes, copied = length;
337 struct squashfs_cache_entry *entry;
338
339 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
340
341 while (length) {
342 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
343 if (entry->error)
344 return entry->error;
345 else if (*offset >= entry->length)
346 return -EIO;
347
348 bytes = squashfs_copy_data(buffer, entry, *offset, length);
349 if (buffer)
350 buffer += bytes;
351 length -= bytes;
352 *offset += bytes;
353
354 if (*offset == entry->length) {
355 *block = entry->next_index;
356 *offset = 0;
357 }
358
359 squashfs_cache_put(entry);
360 }
361
362 return copied;
363}
364
365
366/*
367 * Look-up in the fragmment cache the fragment located at <start_block> in the
368 * filesystem. If necessary read and decompress it from disk.
369 */
370struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb,
371 u64 start_block, int length)
372{
373 struct squashfs_sb_info *msblk = sb->s_fs_info;
374
375 return squashfs_cache_get(sb, msblk->fragment_cache, start_block,
376 length);
377}
378
379
380/*
381 * Read and decompress the datablock located at <start_block> in the
382 * filesystem. The cache is used here to avoid duplicating locking and
383 * read/decompress code.
384 */
385struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
386 u64 start_block, int length)
387{
388 struct squashfs_sb_info *msblk = sb->s_fs_info;
389
390 return squashfs_cache_get(sb, msblk->read_page, start_block, length);
391}
392
393
394/*
395 * Read a filesystem table (uncompressed sequence of bytes) from disk
396 */
397int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
398 int length)
399{
400 int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
401 int i, res;
402 void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
403 if (data == NULL)
404 return -ENOMEM;
405
406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
407 data[i] = buffer;
408 res = squashfs_read_data(sb, data, block, length |
409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
410 kfree(data);
411 return res;
412}
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
new file mode 100644
index 000000000000..566b0eaed868
--- /dev/null
+++ b/fs/squashfs/dir.c
@@ -0,0 +1,235 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * dir.c
22 */
23
24/*
25 * This file implements code to read directories from disk.
26 *
27 * See namei.c for a description of directory organisation on disk.
28 */
29
30#include <linux/fs.h>
31#include <linux/vfs.h>
32#include <linux/slab.h>
33#include <linux/zlib.h>
34
35#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h"
37#include "squashfs_fs_i.h"
38#include "squashfs.h"
39
40static const unsigned char squashfs_filetype_table[] = {
41 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK
42};
43
44/*
45 * Lookup offset (f_pos) in the directory index, returning the
46 * metadata block containing it.
47 *
48 * If we get an error reading the index then return the part of the index
49 * (if any) we have managed to read - the index isn't essential, just
50 * quicker.
51 */
52static int get_dir_index_using_offset(struct super_block *sb,
53 u64 *next_block, int *next_offset, u64 index_start, int index_offset,
54 int i_count, u64 f_pos)
55{
56 struct squashfs_sb_info *msblk = sb->s_fs_info;
57 int err, i, index, length = 0;
58 struct squashfs_dir_index dir_index;
59
60 TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
61 i_count, f_pos);
62
63 /*
64 * Translate from external f_pos to the internal f_pos. This
65 * is offset by 3 because we invent "." and ".." entries which are
66 * not actually stored in the directory.
67 */
68 if (f_pos < 3)
69 return f_pos;
70 f_pos -= 3;
71
72 for (i = 0; i < i_count; i++) {
73 err = squashfs_read_metadata(sb, &dir_index, &index_start,
74 &index_offset, sizeof(dir_index));
75 if (err < 0)
76 break;
77
78 index = le32_to_cpu(dir_index.index);
79 if (index > f_pos)
80 /*
81 * Found the index we're looking for.
82 */
83 break;
84
85 err = squashfs_read_metadata(sb, NULL, &index_start,
86 &index_offset, le32_to_cpu(dir_index.size) + 1);
87 if (err < 0)
88 break;
89
90 length = index;
91 *next_block = le32_to_cpu(dir_index.start_block) +
92 msblk->directory_table;
93 }
94
95 *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
96
97 /*
98 * Translate back from internal f_pos to external f_pos.
99 */
100 return length + 3;
101}
102
103
104static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
105{
106 struct inode *inode = file->f_dentry->d_inode;
107 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
108 u64 block = squashfs_i(inode)->start + msblk->directory_table;
109 int offset = squashfs_i(inode)->offset, length = 0, dir_count, size,
110 type, err;
111 unsigned int inode_number;
112 struct squashfs_dir_header dirh;
113 struct squashfs_dir_entry *dire;
114
115 TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset);
116
117 dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
118 if (dire == NULL) {
119 ERROR("Failed to allocate squashfs_dir_entry\n");
120 goto finish;
121 }
122
123 /*
124 * Return "." and ".." entries as the first two filenames in the
125 * directory. To maximise compression these two entries are not
126 * stored in the directory, and so we invent them here.
127 *
128 * It also means that the external f_pos is offset by 3 from the
129 * on-disk directory f_pos.
130 */
131 while (file->f_pos < 3) {
132 char *name;
133 int i_ino;
134
135 if (file->f_pos == 0) {
136 name = ".";
137 size = 1;
138 i_ino = inode->i_ino;
139 } else {
140 name = "..";
141 size = 2;
142 i_ino = squashfs_i(inode)->parent;
143 }
144
145 TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
146 dirent, name, size, file->f_pos, i_ino,
147 squashfs_filetype_table[1]);
148
149 if (filldir(dirent, name, size, file->f_pos, i_ino,
150 squashfs_filetype_table[1]) < 0) {
151 TRACE("Filldir returned less than 0\n");
152 goto finish;
153 }
154
155 file->f_pos += size;
156 }
157
158 length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
159 squashfs_i(inode)->dir_idx_start,
160 squashfs_i(inode)->dir_idx_offset,
161 squashfs_i(inode)->dir_idx_cnt,
162 file->f_pos);
163
164 while (length < i_size_read(inode)) {
165 /*
166 * Read directory header
167 */
168 err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
169 &offset, sizeof(dirh));
170 if (err < 0)
171 goto failed_read;
172
173 length += sizeof(dirh);
174
175 dir_count = le32_to_cpu(dirh.count) + 1;
176 while (dir_count--) {
177 /*
178 * Read directory entry.
179 */
180 err = squashfs_read_metadata(inode->i_sb, dire, &block,
181 &offset, sizeof(*dire));
182 if (err < 0)
183 goto failed_read;
184
185 size = le16_to_cpu(dire->size) + 1;
186
187 err = squashfs_read_metadata(inode->i_sb, dire->name,
188 &block, &offset, size);
189 if (err < 0)
190 goto failed_read;
191
192 length += sizeof(*dire) + size;
193
194 if (file->f_pos >= length)
195 continue;
196
197 dire->name[size] = '\0';
198 inode_number = le32_to_cpu(dirh.inode_number) +
199 ((short) le16_to_cpu(dire->inode_number));
200 type = le16_to_cpu(dire->type);
201
202 TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
203 "\n", dirent, dire->name, size,
204 file->f_pos,
205 le32_to_cpu(dirh.start_block),
206 le16_to_cpu(dire->offset),
207 inode_number,
208 squashfs_filetype_table[type]);
209
210 if (filldir(dirent, dire->name, size, file->f_pos,
211 inode_number,
212 squashfs_filetype_table[type]) < 0) {
213 TRACE("Filldir returned less than 0\n");
214 goto finish;
215 }
216
217 file->f_pos = length;
218 }
219 }
220
221finish:
222 kfree(dire);
223 return 0;
224
225failed_read:
226 ERROR("Unable to read directory block [%llx:%x]\n", block, offset);
227 kfree(dire);
228 return 0;
229}
230
231
232const struct file_operations squashfs_dir_ops = {
233 .read = generic_read_dir,
234 .readdir = squashfs_readdir
235};
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
new file mode 100644
index 000000000000..69e971d5ddc1
--- /dev/null
+++ b/fs/squashfs/export.c
@@ -0,0 +1,155 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * export.c
22 */
23
24/*
25 * This file implements code to make Squashfs filesystems exportable (NFS etc.)
26 *
27 * The export code uses an inode lookup table to map inode numbers passed in
28 * filehandles to an inode location on disk. This table is stored compressed
29 * into metadata blocks. A second index table is used to locate these. This
30 * second index table for speed of access (and because it is small) is read at
31 * mount time and cached in memory.
32 *
33 * The inode lookup table is used only by the export code, inode disk
34 * locations are directly encoded in directories, enabling direct access
35 * without an intermediate lookup for all operations except the export ops.
36 */
37
38#include <linux/fs.h>
39#include <linux/vfs.h>
40#include <linux/dcache.h>
41#include <linux/exportfs.h>
42#include <linux/zlib.h>
43
44#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h"
47#include "squashfs.h"
48
49/*
50 * Look-up inode number (ino) in table, returning the inode location.
51 */
52static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
53{
54 struct squashfs_sb_info *msblk = sb->s_fs_info;
55 int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
56 int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
57 u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]);
58 __le64 ino;
59 int err;
60
61 TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
62
63 err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
64 if (err < 0)
65 return err;
66
67 TRACE("squashfs_inode_lookup, inode = 0x%llx\n",
68 (u64) le64_to_cpu(ino));
69
70 return le64_to_cpu(ino);
71}
72
73
74static struct dentry *squashfs_export_iget(struct super_block *sb,
75 unsigned int ino_num)
76{
77 long long ino;
78 struct dentry *dentry = ERR_PTR(-ENOENT);
79
80 TRACE("Entered squashfs_export_iget\n");
81
82 ino = squashfs_inode_lookup(sb, ino_num);
83 if (ino >= 0)
84 dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num));
85
86 return dentry;
87}
88
89
90static struct dentry *squashfs_fh_to_dentry(struct super_block *sb,
91 struct fid *fid, int fh_len, int fh_type)
92{
93 if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT)
94 || fh_len < 2)
95 return NULL;
96
97 return squashfs_export_iget(sb, fid->i32.ino);
98}
99
100
101static struct dentry *squashfs_fh_to_parent(struct super_block *sb,
102 struct fid *fid, int fh_len, int fh_type)
103{
104 if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4)
105 return NULL;
106
107 return squashfs_export_iget(sb, fid->i32.parent_ino);
108}
109
110
111static struct dentry *squashfs_get_parent(struct dentry *child)
112{
113 struct inode *inode = child->d_inode;
114 unsigned int parent_ino = squashfs_i(inode)->parent;
115
116 return squashfs_export_iget(inode->i_sb, parent_ino);
117}
118
119
120/*
121 * Read uncompressed inode lookup table indexes off disk into memory
122 */
123__le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
124 u64 lookup_table_start, unsigned int inodes)
125{
126 unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
127 __le64 *inode_lookup_table;
128 int err;
129
130 TRACE("In read_inode_lookup_table, length %d\n", length);
131
132 /* Allocate inode lookup table indexes */
133 inode_lookup_table = kmalloc(length, GFP_KERNEL);
134 if (inode_lookup_table == NULL) {
135 ERROR("Failed to allocate inode lookup table\n");
136 return ERR_PTR(-ENOMEM);
137 }
138
139 err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start,
140 length);
141 if (err < 0) {
142 ERROR("unable to read inode lookup table\n");
143 kfree(inode_lookup_table);
144 return ERR_PTR(err);
145 }
146
147 return inode_lookup_table;
148}
149
150
151const struct export_operations squashfs_export_ops = {
152 .fh_to_dentry = squashfs_fh_to_dentry,
153 .fh_to_parent = squashfs_fh_to_parent,
154 .get_parent = squashfs_get_parent
155};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
new file mode 100644
index 000000000000..717767d831df
--- /dev/null
+++ b/fs/squashfs/file.c
@@ -0,0 +1,502 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * file.c
22 */
23
24/*
25 * This file contains code for handling regular files. A regular file
26 * consists of a sequence of contiguous compressed blocks, and/or a
27 * compressed fragment block (tail-end packed block). The compressed size
28 * of each datablock is stored in a block list contained within the
29 * file inode (itself stored in one or more compressed metadata blocks).
30 *
31 * To speed up access to datablocks when reading 'large' files (256 Mbytes or
32 * larger), the code implements an index cache that caches the mapping from
33 * block index to datablock location on disk.
34 *
35 * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
36 * retaining a simple and space-efficient block list on disk. The cache
37 * is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
38 * Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
39 * The index cache is designed to be memory efficient, and by default uses
40 * 16 KiB.
41 */
42
43#include <linux/fs.h>
44#include <linux/vfs.h>
45#include <linux/kernel.h>
46#include <linux/slab.h>
47#include <linux/string.h>
48#include <linux/pagemap.h>
49#include <linux/mutex.h>
50#include <linux/zlib.h>
51
52#include "squashfs_fs.h"
53#include "squashfs_fs_sb.h"
54#include "squashfs_fs_i.h"
55#include "squashfs.h"
56
57/*
58 * Locate cache slot in range [offset, index] for specified inode. If
59 * there's more than one return the slot closest to index.
60 */
61static struct meta_index *locate_meta_index(struct inode *inode, int offset,
62 int index)
63{
64 struct meta_index *meta = NULL;
65 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
66 int i;
67
68 mutex_lock(&msblk->meta_index_mutex);
69
70 TRACE("locate_meta_index: index %d, offset %d\n", index, offset);
71
72 if (msblk->meta_index == NULL)
73 goto not_allocated;
74
75 for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
76 if (msblk->meta_index[i].inode_number == inode->i_ino &&
77 msblk->meta_index[i].offset >= offset &&
78 msblk->meta_index[i].offset <= index &&
79 msblk->meta_index[i].locked == 0) {
80 TRACE("locate_meta_index: entry %d, offset %d\n", i,
81 msblk->meta_index[i].offset);
82 meta = &msblk->meta_index[i];
83 offset = meta->offset;
84 }
85 }
86
87 if (meta)
88 meta->locked = 1;
89
90not_allocated:
91 mutex_unlock(&msblk->meta_index_mutex);
92
93 return meta;
94}
95
96
97/*
98 * Find and initialise an empty cache slot for index offset.
99 */
100static struct meta_index *empty_meta_index(struct inode *inode, int offset,
101 int skip)
102{
103 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
104 struct meta_index *meta = NULL;
105 int i;
106
107 mutex_lock(&msblk->meta_index_mutex);
108
109 TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip);
110
111 if (msblk->meta_index == NULL) {
112 /*
113 * First time cache index has been used, allocate and
114 * initialise. The cache index could be allocated at
115 * mount time but doing it here means it is allocated only
116 * if a 'large' file is read.
117 */
118 msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS,
119 sizeof(*(msblk->meta_index)), GFP_KERNEL);
120 if (msblk->meta_index == NULL) {
121 ERROR("Failed to allocate meta_index\n");
122 goto failed;
123 }
124 for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
125 msblk->meta_index[i].inode_number = 0;
126 msblk->meta_index[i].locked = 0;
127 }
128 msblk->next_meta_index = 0;
129 }
130
131 for (i = SQUASHFS_META_SLOTS; i &&
132 msblk->meta_index[msblk->next_meta_index].locked; i--)
133 msblk->next_meta_index = (msblk->next_meta_index + 1) %
134 SQUASHFS_META_SLOTS;
135
136 if (i == 0) {
137 TRACE("empty_meta_index: failed!\n");
138 goto failed;
139 }
140
141 TRACE("empty_meta_index: returned meta entry %d, %p\n",
142 msblk->next_meta_index,
143 &msblk->meta_index[msblk->next_meta_index]);
144
145 meta = &msblk->meta_index[msblk->next_meta_index];
146 msblk->next_meta_index = (msblk->next_meta_index + 1) %
147 SQUASHFS_META_SLOTS;
148
149 meta->inode_number = inode->i_ino;
150 meta->offset = offset;
151 meta->skip = skip;
152 meta->entries = 0;
153 meta->locked = 1;
154
155failed:
156 mutex_unlock(&msblk->meta_index_mutex);
157 return meta;
158}
159
160
161static void release_meta_index(struct inode *inode, struct meta_index *meta)
162{
163 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
164 mutex_lock(&msblk->meta_index_mutex);
165 meta->locked = 0;
166 mutex_unlock(&msblk->meta_index_mutex);
167}
168
169
170/*
171 * Read the next n blocks from the block list, starting from
172 * metadata block <start_block, offset>.
173 */
174static long long read_indexes(struct super_block *sb, int n,
175 u64 *start_block, int *offset)
176{
177 int err, i;
178 long long block = 0;
179 __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
180
181 if (blist == NULL) {
182 ERROR("read_indexes: Failed to allocate block_list\n");
183 return -ENOMEM;
184 }
185
186 while (n) {
187 int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
188
189 err = squashfs_read_metadata(sb, blist, start_block,
190 offset, blocks << 2);
191 if (err < 0) {
192 ERROR("read_indexes: reading block [%llx:%x]\n",
193 *start_block, *offset);
194 goto failure;
195 }
196
197 for (i = 0; i < blocks; i++) {
198 int size = le32_to_cpu(blist[i]);
199 block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size);
200 }
201 n -= blocks;
202 }
203
204 kfree(blist);
205 return block;
206
207failure:
208 kfree(blist);
209 return err;
210}
211
212
213/*
214 * Each cache index slot has SQUASHFS_META_ENTRIES, each of which
215 * can cache one index -> datablock/blocklist-block mapping. We wish
216 * to distribute these over the length of the file, entry[0] maps index x,
217 * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on.
218 * The larger the file, the greater the skip factor. The skip factor is
219 * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure
220 * the number of metadata blocks that need to be read fits into the cache.
221 * If the skip factor is limited in this way then the file will use multiple
222 * slots.
223 */
224static inline int calculate_skip(int blocks)
225{
226 int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
227 * SQUASHFS_META_INDEXES);
228 return min(SQUASHFS_CACHED_BLKS - 1, skip + 1);
229}
230
231
232/*
233 * Search and grow the index cache for the specified inode, returning the
234 * on-disk locations of the datablock and block list metadata block
235 * <index_block, index_offset> for index (scaled to nearest cache index).
236 */
/*
 * inode is the file being read and index the datablock number wanted.
 * On the all_done path *index_block, *index_offset and *data_block receive
 * the block-list position and datablock location reached, and the return
 * value is the datablock index that position corresponds to
 * (offset * SQUASHFS_META_INDEXES * skip).  If read_indexes() fails, its
 * negative result is returned instead and the three outputs are not written.
 */
237static int fill_meta_index(struct inode *inode, int index,
238 u64 *index_block, int *index_offset, u64 *data_block)
239{
240 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
241 int skip = calculate_skip(i_size_read(inode) >> msblk->block_log);
242 int offset = 0;
243 struct meta_index *meta;
244 struct meta_entry *meta_entry;
 /*
  * Start from the beginning of the file's block list; these three values
  * are replaced below whenever a cached entry is found.
  */
245 u64 cur_index_block = squashfs_i(inode)->block_list_start;
246 int cur_offset = squashfs_i(inode)->offset;
247 u64 cur_data_block = squashfs_i(inode)->start;
248 int err, i;
249
250 /*
251 * Scale index to cache index (cache slot entry)
252 */
253 index /= SQUASHFS_META_INDEXES * skip;
254
255 while (offset < index) {
 /*
  * NOTE(review): locate_meta_index() and empty_meta_index() are defined
  * outside this view; presumably the first finds an existing slot for
  * this inode and the second claims an unused one - confirm.
  */
256 meta = locate_meta_index(inode, offset + 1, index);
257
258 if (meta == NULL) {
259 meta = empty_meta_index(inode, offset + 1, skip);
 /* No slot could be obtained: return the position reached so far. */
260 if (meta == NULL)
261 goto all_done;
262 } else {
 /*
  * Use the slot's entry for index if it holds one, otherwise the
  * last entry the slot currently contains.
  */
263 offset = index < meta->offset + meta->entries ? index :
264 meta->offset + meta->entries - 1;
265 meta_entry = &meta->meta_entry[offset - meta->offset];
 /*
  * index_block is cached relative to msblk->inode_table (see the
  * matching subtraction where entries are stored below).
  */
266 cur_index_block = meta_entry->index_block +
267 msblk->inode_table;
268 cur_offset = meta_entry->offset;
269 cur_data_block = meta_entry->data_block;
270 TRACE("get_meta_index: offset %d, meta->offset %d, "
271 "meta->entries %d\n", offset, meta->offset,
272 meta->entries);
273 TRACE("get_meta_index: index_block 0x%llx, offset 0x%x"
274 " data_block 0x%llx\n", cur_index_block,
275 cur_offset, cur_data_block);
276 }
277
278 /*
279 * If necessary grow cache slot by reading block list. Cache
280 * slot is extended up to index or to the end of the slot, in
281 * which case further slots will be used.
282 */
283 for (i = meta->offset + meta->entries; i <= index &&
284 i < meta->offset + SQUASHFS_META_ENTRIES; i++) {
285 int blocks = skip * SQUASHFS_META_INDEXES;
286 long long res = read_indexes(inode->i_sb, blocks,
287 &cur_index_block, &cur_offset);
288
289 if (res < 0) {
290 if (meta->entries == 0)
291 /*
292 * Don't leave an empty slot on read
293 * error allocated to this inode...
294 */
295 meta->inode_number = 0;
296 err = res;
297 goto failed;
298 }
299
 /* The value returned by read_indexes() advances the datablock position. */
300 cur_data_block += res;
301 meta_entry = &meta->meta_entry[i - meta->offset];
302 meta_entry->index_block = cur_index_block -
303 msblk->inode_table;
304 meta_entry->offset = cur_offset;
305 meta_entry->data_block = cur_data_block;
306 meta->entries++;
307 offset++;
308 }
309
310 TRACE("get_meta_index: meta->offset %d, meta->entries %d\n",
311 meta->offset, meta->entries);
312
313 release_meta_index(inode, meta);
314 }
315
316all_done:
317 *index_block = cur_index_block;
318 *index_offset = cur_offset;
319 *data_block = cur_data_block;
320
321 /*
322 * Scale cache index (cache slot entry) to index
323 */
324 return offset * SQUASHFS_META_INDEXES * skip;
325
 /* Error exit: release the slot obtained above before returning err. */
326failed:
327 release_meta_index(inode, meta);
328 return err;
329}
330
331
332/*
333 * Get the on-disk location and compressed size of the datablock
334 * specified by index. Fill_meta_index() does most of the work.
335 */
336static int read_blocklist(struct inode *inode, int index, u64 *block)
337{
338 u64 start;
339 long long blks;
340 int offset;
341 __le32 size;
342 int res = fill_meta_index(inode, index, &start, &offset, block);
343
344 TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
345 " 0x%x, block 0x%llx\n", res, index, start, offset,
346 *block);
347
348 if (res < 0)
349 return res;
350
351 /*
352 * res contains the index of the mapping returned by fill_meta_index(),
353 * this will likely be less than the desired index (because the
354 * meta_index cache works at a higher granularity). Read any
355 * extra block indexes needed.
356 */
357 if (res < index) {
358 blks = read_indexes(inode->i_sb, index - res, &start, &offset);
359 if (blks < 0)
360 return (int) blks;
361 *block += blks;
362 }
363
364 /*
365 * Read length of block specified by index.
366 */
367 res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
368 sizeof(size));
369 if (res < 0)
370 return res;
371 return le32_to_cpu(size);
372}
373
374
/*
 * Fill the page passed in and, where they can be grabbed, the other page
 * cache pages covered by the same datablock or fragment.  Always returns 0:
 * a failure is signalled by setting PageError on the page, which is then
 * zero-filled and unlocked.
 */
375static int squashfs_readpage(struct file *file, struct page *page)
376{
377 struct inode *inode = page->mapping->host;
378 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
379 int bytes, i, offset = 0, sparse = 0;
380 struct squashfs_cache_entry *buffer = NULL;
381 void *pageaddr;
382
 /*
  * mask:        pages per datablock, minus one
  * index:       datablock number containing this page
  * start_index: first page index covered by that datablock
  * end_index:   last page index covered by that datablock
  * file_end:    file size in whole datablocks, i.e. the index of the
  *              final partial block when there is one
  */
383 int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
384 int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
385 int start_index = page->index & ~mask;
386 int end_index = start_index | mask;
387 int file_end = i_size_read(inode) >> msblk->block_log;
388
389 TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
390 page->index, squashfs_i(inode)->start);
391
 /*
  * Page starts at or beyond end of file: nothing to read, the "out" path
  * zero-fills it and marks it up to date.
  */
392 if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
393 PAGE_CACHE_SHIFT))
394 goto out;
395
396 if (index < file_end || squashfs_i(inode)->fragment_block ==
397 SQUASHFS_INVALID_BLK) {
398 /*
399 * Reading a datablock from disk. Need to read block list
400 * to get location and block size.
401 */
402 u64 block = 0;
403 int bsize = read_blocklist(inode, index, &block);
404 if (bsize < 0)
405 goto error_out;
406
 /*
  * A zero size marks a hole: no buffer is read, and bytes is set to the
  * amount of file data the hole covers so the loop below zeroes the
  * right number of pages.
  */
407 if (bsize == 0) { /* hole */
408 bytes = index == file_end ?
409 (i_size_read(inode) & (msblk->block_size - 1)) :
410 msblk->block_size;
411 sparse = 1;
412 } else {
413 /*
414 * Read and decompress datablock.
415 */
416 buffer = squashfs_get_datablock(inode->i_sb,
417 block, bsize);
418 if (buffer->error) {
419 ERROR("Unable to read page, block %llx, size %x"
420 "\n", block, bsize);
421 squashfs_cache_put(buffer);
422 goto error_out;
423 }
424 bytes = buffer->length;
425 }
426 } else {
427 /*
428 * Datablock is stored inside a fragment (tail-end packed
429 * block).
430 */
431 buffer = squashfs_get_fragment(inode->i_sb,
432 squashfs_i(inode)->fragment_block,
433 squashfs_i(inode)->fragment_size);
434
435 if (buffer->error) {
436 ERROR("Unable to read page, block %llx, size %x\n",
437 squashfs_i(inode)->fragment_block,
438 squashfs_i(inode)->fragment_size);
439 squashfs_cache_put(buffer);
440 goto error_out;
441 }
 /*
  * The tail occupies the bytes past the last whole datablock, starting
  * at fragment_offset inside the fragment buffer.
  */
442 bytes = i_size_read(inode) & (msblk->block_size - 1);
443 offset = squashfs_i(inode)->fragment_offset;
444 }
445
446 /*
447 * Loop copying datablock into pages. As the datablock likely covers
448 * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
449 * grab the pages from the page cache, except for the page that we've
450 * been called to fill.
451 */
452 for (i = start_index; i <= end_index && bytes > 0; i++,
453 bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
454 struct page *push_page;
 /* For a hole nothing is copied; the memset below zeroes the whole page. */
455 int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
456
457 TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
458
459 push_page = (i == page->index) ? page :
460 grab_cache_page_nowait(page->mapping, i);
461
 /* A page that could not be grabbed is simply skipped. */
462 if (!push_page)
463 continue;
464
465 if (PageUptodate(push_page))
466 goto skip_page;
467
468 pageaddr = kmap_atomic(push_page, KM_USER0);
469 squashfs_copy_data(pageaddr, buffer, offset, avail);
470 memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
471 kunmap_atomic(pageaddr, KM_USER0);
472 flush_dcache_page(push_page);
473 SetPageUptodate(push_page);
474skip_page:
475 unlock_page(push_page);
 /* Pages grabbed in this loop are released here; the caller's page is not. */
476 if (i != page->index)
477 page_cache_release(push_page);
478 }
479
 /* buffer is only set when a datablock or fragment was actually read. */
480 if (!sparse)
481 squashfs_cache_put(buffer);
482
483 return 0;
484
485error_out:
486 SetPageError(page);
 /*
  * Shared exit for errors and for pages past end of file: zero the page,
  * mark it up to date only if no error was flagged, and unlock it.
  */
487out:
488 pageaddr = kmap_atomic(page, KM_USER0);
489 memset(pageaddr, 0, PAGE_CACHE_SIZE);
490 kunmap_atomic(pageaddr, KM_USER0);
491 flush_dcache_page(page);
492 if (!PageError(page))
493 SetPageUptodate(page);
494 unlock_page(page);
495
496 return 0;
497}
498
499
500const struct address_space_operations squashfs_aops = {
501 .readpage = squashfs_readpage
502};
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
new file mode 100644
index 000000000000..b5a2c15bbbc7
--- /dev/null
+++ b/fs/squashfs/fragment.c
@@ -0,0 +1,98 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * fragment.c
22 */
23
24/*
25 * This file implements code to handle compressed fragments (tail-end packed
26 * datablocks).
27 *
28 * Regular files contain a fragment index which is mapped to a fragment
29 * location on disk and compressed size using a fragment lookup table.
30 * Like everything in Squashfs this fragment lookup table is itself stored
31 * compressed into metadata blocks. A second index table is used to locate
32 * these. This second index table for speed of access (and because it
33 * is small) is read at mount time and cached in memory.
34 */
35
36#include <linux/fs.h>
37#include <linux/vfs.h>
38#include <linux/slab.h>
39#include <linux/zlib.h>
40
41#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h"
44#include "squashfs.h"
45
46/*
47 * Look-up fragment using the fragment index table. Return the on disk
48 * location of the fragment and its compressed size
49 */
50int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
51 u64 *fragment_block)
52{
53 struct squashfs_sb_info *msblk = sb->s_fs_info;
54 int block = SQUASHFS_FRAGMENT_INDEX(fragment);
55 int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment);
56 u64 start_block = le64_to_cpu(msblk->fragment_index[block]);
57 struct squashfs_fragment_entry fragment_entry;
58 int size;
59
60 size = squashfs_read_metadata(sb, &fragment_entry, &start_block,
61 &offset, sizeof(fragment_entry));
62 if (size < 0)
63 return size;
64
65 *fragment_block = le64_to_cpu(fragment_entry.start_block);
66 size = le32_to_cpu(fragment_entry.size);
67
68 return size;
69}
70
71
72/*
73 * Read the uncompressed fragment lookup table indexes off disk into memory
74 */
75__le64 *squashfs_read_fragment_index_table(struct super_block *sb,
76 u64 fragment_table_start, unsigned int fragments)
77{
78 unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
79 __le64 *fragment_index;
80 int err;
81
82 /* Allocate fragment lookup table indexes */
83 fragment_index = kmalloc(length, GFP_KERNEL);
84 if (fragment_index == NULL) {
85 ERROR("Failed to allocate fragment index table\n");
86 return ERR_PTR(-ENOMEM);
87 }
88
89 err = squashfs_read_table(sb, fragment_index, fragment_table_start,
90 length);
91 if (err < 0) {
92 ERROR("unable to read fragment index table\n");
93 kfree(fragment_index);
94 return ERR_PTR(err);
95 }
96
97 return fragment_index;
98}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
new file mode 100644
index 000000000000..3795b837ba28
--- /dev/null
+++ b/fs/squashfs/id.c
@@ -0,0 +1,94 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * id.c
22 */
23
24/*
25 * This file implements code to handle uids and gids.
26 *
27 * For space efficiency regular files store uid and gid indexes, which are
28 * converted to 32-bit uids/gids using an id look up table. This table is
29 * stored compressed into metadata blocks. A second index table is used to
30 * locate these. This second index table for speed of access (and because it
31 * is small) is read at mount time and cached in memory.
32 */
33
34#include <linux/fs.h>
35#include <linux/vfs.h>
36#include <linux/slab.h>
37#include <linux/zlib.h>
38
39#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h"
42#include "squashfs.h"
43
44/*
45 * Map uid/gid index into real 32-bit uid/gid using the id look up table
46 */
47int squashfs_get_id(struct super_block *sb, unsigned int index,
48 unsigned int *id)
49{
50 struct squashfs_sb_info *msblk = sb->s_fs_info;
51 int block = SQUASHFS_ID_BLOCK(index);
52 int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
53 u64 start_block = le64_to_cpu(msblk->id_table[block]);
54 __le32 disk_id;
55 int err;
56
57 err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
58 sizeof(disk_id));
59 if (err < 0)
60 return err;
61
62 *id = le32_to_cpu(disk_id);
63 return 0;
64}
65
66
67/*
68 * Read uncompressed id lookup table indexes from disk into memory
69 */
70__le64 *squashfs_read_id_index_table(struct super_block *sb,
71 u64 id_table_start, unsigned short no_ids)
72{
73 unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
74 __le64 *id_table;
75 int err;
76
77 TRACE("In read_id_index_table, length %d\n", length);
78
79 /* Allocate id lookup table indexes */
80 id_table = kmalloc(length, GFP_KERNEL);
81 if (id_table == NULL) {
82 ERROR("Failed to allocate id index table\n");
83 return ERR_PTR(-ENOMEM);
84 }
85
86 err = squashfs_read_table(sb, id_table, id_table_start, length);
87 if (err < 0) {
88 ERROR("unable to read id index table\n");
89 kfree(id_table);
90 return ERR_PTR(err);
91 }
92
93 return id_table;
94}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
new file mode 100644
index 000000000000..7a63398bb855
--- /dev/null
+++ b/fs/squashfs/inode.c
@@ -0,0 +1,346 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * inode.c
22 */
23
24/*
25 * This file implements code to create and read inodes from disk.
26 *
27 * Inodes in Squashfs are identified by a 48-bit inode which encodes the
28 * location of the compressed metadata block containing the inode, and the byte
29 * offset into that block where the inode is placed (<block, offset>).
30 *
31 * To maximise compression there are different inodes for each file type
32 * (regular file, directory, device, etc.), the inode contents and length
33 * varying with the type.
34 *
35 * To further maximise compression, two types of regular file inode and
36 * directory inode are defined: inodes optimised for frequently occurring
37 * regular files and directories, and extended types where extra
38 * information has to be stored.
39 */
40
41#include <linux/fs.h>
42#include <linux/vfs.h>
43#include <linux/zlib.h>
44
45#include "squashfs_fs.h"
46#include "squashfs_fs_sb.h"
47#include "squashfs_fs_i.h"
48#include "squashfs.h"
49
50/*
51 * Initialise VFS inode with the base inode information common to all
52 * Squashfs inode types. Sqsh_ino contains the unswapped base inode
53 * off disk.
54 */
55static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
56 struct squashfs_base_inode *sqsh_ino)
57{
58 int err;
59
60 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid);
61 if (err)
62 return err;
63
64 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid);
65 if (err)
66 return err;
67
68 inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
69 inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
70 inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
71 inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
72 inode->i_mode = le16_to_cpu(sqsh_ino->mode);
73 inode->i_size = 0;
74
75 return err;
76}
77
78
79struct inode *squashfs_iget(struct super_block *sb, long long ino,
80 unsigned int ino_number)
81{
82 struct inode *inode = iget_locked(sb, ino_number);
83 int err;
84
85 TRACE("Entered squashfs_iget\n");
86
87 if (!inode)
88 return ERR_PTR(-ENOMEM);
89 if (!(inode->i_state & I_NEW))
90 return inode;
91
92 err = squashfs_read_inode(inode, ino);
93 if (err) {
94 iget_failed(inode);
95 return ERR_PTR(err);
96 }
97
98 unlock_new_inode(inode);
99 return inode;
100}
101
102
103/*
104 * Initialise VFS inode by reading inode from inode table (compressed
105 * metadata). The format and amount of data read depends on type.
106 */
107int squashfs_read_inode(struct inode *inode, long long ino)
108{
109 struct super_block *sb = inode->i_sb;
110 struct squashfs_sb_info *msblk = sb->s_fs_info;
111 u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
112 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
113 union squashfs_inode squashfs_ino;
114 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
115
116 TRACE("Entered squashfs_read_inode\n");
117
118 /*
119 * Read inode base common to all inode types.
120 */
121 err = squashfs_read_metadata(sb, sqshb_ino, &block,
122 &offset, sizeof(*sqshb_ino));
123 if (err < 0)
124 goto failed_read;
125
126 err = squashfs_new_inode(sb, inode, sqshb_ino);
127 if (err)
128 goto failed_read;
129
130 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
131 offset = SQUASHFS_INODE_OFFSET(ino);
132
133 type = le16_to_cpu(sqshb_ino->inode_type);
134 switch (type) {
135 case SQUASHFS_REG_TYPE: {
136 unsigned int frag_offset, frag_size, frag;
137 u64 frag_blk;
138 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
139
140 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
141 sizeof(*sqsh_ino));
142 if (err < 0)
143 goto failed_read;
144
145 frag = le32_to_cpu(sqsh_ino->fragment);
146 if (frag != SQUASHFS_INVALID_FRAG) {
147 frag_offset = le32_to_cpu(sqsh_ino->offset);
148 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
149 if (frag_size < 0) {
150 err = frag_size;
151 goto failed_read;
152 }
153 } else {
154 frag_blk = SQUASHFS_INVALID_BLK;
155 frag_size = 0;
156 frag_offset = 0;
157 }
158
159 inode->i_nlink = 1;
160 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
161 inode->i_fop = &generic_ro_fops;
162 inode->i_mode |= S_IFREG;
163 inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
164 squashfs_i(inode)->fragment_block = frag_blk;
165 squashfs_i(inode)->fragment_size = frag_size;
166 squashfs_i(inode)->fragment_offset = frag_offset;
167 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
168 squashfs_i(inode)->block_list_start = block;
169 squashfs_i(inode)->offset = offset;
170 inode->i_data.a_ops = &squashfs_aops;
171
172 TRACE("File inode %x:%x, start_block %llx, block_list_start "
173 "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
174 offset, squashfs_i(inode)->start, block, offset);
175 break;
176 }
177 case SQUASHFS_LREG_TYPE: {
178 unsigned int frag_offset, frag_size, frag;
179 u64 frag_blk;
180 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
181
182 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
183 sizeof(*sqsh_ino));
184 if (err < 0)
185 goto failed_read;
186
187 frag = le32_to_cpu(sqsh_ino->fragment);
188 if (frag != SQUASHFS_INVALID_FRAG) {
189 frag_offset = le32_to_cpu(sqsh_ino->offset);
190 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
191 if (frag_size < 0) {
192 err = frag_size;
193 goto failed_read;
194 }
195 } else {
196 frag_blk = SQUASHFS_INVALID_BLK;
197 frag_size = 0;
198 frag_offset = 0;
199 }
200
201 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
202 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
203 inode->i_fop = &generic_ro_fops;
204 inode->i_mode |= S_IFREG;
205 inode->i_blocks = ((inode->i_size -
206 le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
207
208 squashfs_i(inode)->fragment_block = frag_blk;
209 squashfs_i(inode)->fragment_size = frag_size;
210 squashfs_i(inode)->fragment_offset = frag_offset;
211 squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
212 squashfs_i(inode)->block_list_start = block;
213 squashfs_i(inode)->offset = offset;
214 inode->i_data.a_ops = &squashfs_aops;
215
216 TRACE("File inode %x:%x, start_block %llx, block_list_start "
217 "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
218 offset, squashfs_i(inode)->start, block, offset);
219 break;
220 }
221 case SQUASHFS_DIR_TYPE: {
222 struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir;
223
224 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
225 sizeof(*sqsh_ino));
226 if (err < 0)
227 goto failed_read;
228
229 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
230 inode->i_size = le16_to_cpu(sqsh_ino->file_size);
231 inode->i_op = &squashfs_dir_inode_ops;
232 inode->i_fop = &squashfs_dir_ops;
233 inode->i_mode |= S_IFDIR;
234 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
235 squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
236 squashfs_i(inode)->dir_idx_cnt = 0;
237 squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
238
239 TRACE("Directory inode %x:%x, start_block %llx, offset %x\n",
240 SQUASHFS_INODE_BLK(ino), offset,
241 squashfs_i(inode)->start,
242 le16_to_cpu(sqsh_ino->offset));
243 break;
244 }
245 case SQUASHFS_LDIR_TYPE: {
246 struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir;
247
248 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
249 sizeof(*sqsh_ino));
250 if (err < 0)
251 goto failed_read;
252
253 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
254 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
255 inode->i_op = &squashfs_dir_inode_ops;
256 inode->i_fop = &squashfs_dir_ops;
257 inode->i_mode |= S_IFDIR;
258 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
259 squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
260 squashfs_i(inode)->dir_idx_start = block;
261 squashfs_i(inode)->dir_idx_offset = offset;
262 squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count);
263 squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
264
265 TRACE("Long directory inode %x:%x, start_block %llx, offset "
266 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
267 squashfs_i(inode)->start,
268 le16_to_cpu(sqsh_ino->offset));
269 break;
270 }
271 case SQUASHFS_SYMLINK_TYPE:
272 case SQUASHFS_LSYMLINK_TYPE: {
273 struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink;
274
275 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
276 sizeof(*sqsh_ino));
277 if (err < 0)
278 goto failed_read;
279
280 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
281 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
282 inode->i_op = &page_symlink_inode_operations;
283 inode->i_data.a_ops = &squashfs_symlink_aops;
284 inode->i_mode |= S_IFLNK;
285 squashfs_i(inode)->start = block;
286 squashfs_i(inode)->offset = offset;
287
288 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
289 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
290 block, offset);
291 break;
292 }
293 case SQUASHFS_BLKDEV_TYPE:
294 case SQUASHFS_CHRDEV_TYPE:
295 case SQUASHFS_LBLKDEV_TYPE:
296 case SQUASHFS_LCHRDEV_TYPE: {
297 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
298 unsigned int rdev;
299
300 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
301 sizeof(*sqsh_ino));
302 if (err < 0)
303 goto failed_read;
304
305 if (type == SQUASHFS_CHRDEV_TYPE)
306 inode->i_mode |= S_IFCHR;
307 else
308 inode->i_mode |= S_IFBLK;
309 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
310 rdev = le32_to_cpu(sqsh_ino->rdev);
311 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
312
313 TRACE("Device inode %x:%x, rdev %x\n",
314 SQUASHFS_INODE_BLK(ino), offset, rdev);
315 break;
316 }
317 case SQUASHFS_FIFO_TYPE:
318 case SQUASHFS_SOCKET_TYPE:
319 case SQUASHFS_LFIFO_TYPE:
320 case SQUASHFS_LSOCKET_TYPE: {
321 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
322
323 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
324 sizeof(*sqsh_ino));
325 if (err < 0)
326 goto failed_read;
327
328 if (type == SQUASHFS_FIFO_TYPE)
329 inode->i_mode |= S_IFIFO;
330 else
331 inode->i_mode |= S_IFSOCK;
332 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
333 init_special_inode(inode, inode->i_mode, 0);
334 break;
335 }
336 default:
337 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
338 return -EINVAL;
339 }
340
341 return 0;
342
343failed_read:
344 ERROR("Unable to read inode 0x%llx\n", ino);
345 return err;
346}
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
new file mode 100644
index 000000000000..9e398653b22b
--- /dev/null
+++ b/fs/squashfs/namei.c
@@ -0,0 +1,242 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * namei.c
22 */
23
24/*
25 * This file implements code to do filename lookup in directories.
26 *
27 * Like inodes, directories are packed into compressed metadata blocks, stored
28 * in a directory table. Directories are accessed using the start address of
29 * the metablock containing the directory and the offset into the
30 * decompressed block (<block, offset>).
31 *
32 * Directories are organised in a slightly complex way, and are not simply
33 * a list of file names. The organisation takes advantage of the
34 * fact that (in most cases) the inodes of the files will be in the same
35 * compressed metadata block, and therefore, can share the start block.
36 * Directories are therefore organised in a two level list, a directory
37 * header containing the shared start block value, and a sequence of directory
38 * entries, each of which share the shared start block. A new directory header
39 * is written once/if the inode start block changes. The directory
40 * header/directory entry list is repeated as many times as necessary.
41 *
42 * Directories are sorted, and can contain a directory index to speed up
43 * file lookup. Directory indexes store one entry per metablock, each entry
44 * storing the index/filename mapping to the first directory header
45 * in each metadata block. Directories are sorted in alphabetical order,
46 * and at lookup the index is scanned linearly looking for the first filename
47 * alphabetically larger than the filename being looked up. At this point the
48 * location of the metadata block the filename is in has been found.
49 * The general idea of the index is to ensure only one metadata block needs to be
50 * decompressed to do a lookup irrespective of the length of the directory.
51 * This scheme has the advantage that it doesn't require extra memory overhead
52 * and doesn't require much extra storage on disk.
53 */
54
55#include <linux/fs.h>
56#include <linux/vfs.h>
57#include <linux/slab.h>
58#include <linux/string.h>
59#include <linux/dcache.h>
60#include <linux/zlib.h>
61
62#include "squashfs_fs.h"
63#include "squashfs_fs_sb.h"
64#include "squashfs_fs_i.h"
65#include "squashfs.h"
66
67/*
68 * Lookup name in the directory index, returning the location of the metadata
69 * block containing it, and the directory index this represents.
70 *
71 * If we get an error reading the index then return the part of the index
72 * (if any) we have managed to read - the index isn't essential, just
73 * quicker.
74 */
75static int get_dir_index_using_name(struct super_block *sb,
76 u64 *next_block, int *next_offset, u64 index_start,
77 int index_offset, int i_count, const char *name,
78 int len)
79{
80 struct squashfs_sb_info *msblk = sb->s_fs_info;
81 int i, size, length = 0, err;
82 struct squashfs_dir_index *index;
83 char *str;
84
85 TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count);
86
87 index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL);
88 if (index == NULL) {
89 ERROR("Failed to allocate squashfs_dir_index\n");
90 goto out;
91 }
92
93 str = &index->name[SQUASHFS_NAME_LEN + 1];
94 strncpy(str, name, len);
95 str[len] = '\0';
96
97 for (i = 0; i < i_count; i++) {
98 err = squashfs_read_metadata(sb, index, &index_start,
99 &index_offset, sizeof(*index));
100 if (err < 0)
101 break;
102
103
104 size = le32_to_cpu(index->size) + 1;
105
106 err = squashfs_read_metadata(sb, index->name, &index_start,
107 &index_offset, size);
108 if (err < 0)
109 break;
110
111 index->name[size] = '\0';
112
113 if (strcmp(index->name, str) > 0)
114 break;
115
116 length = le32_to_cpu(index->index);
117 *next_block = le32_to_cpu(index->start_block) +
118 msblk->directory_table;
119 }
120
121 *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
122 kfree(index);
123
124out:
125 /*
126 * Return index (f_pos) of the looked up metadata block. Translate
127 * from internal f_pos to external f_pos which is offset by 3 because
128 * we invent "." and ".." entries which are not actually stored in the
129 * directory.
130 */
131 return length + 3;
132}
133
134
135static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
136 struct nameidata *nd)
137{
138 const unsigned char *name = dentry->d_name.name;
139 int len = dentry->d_name.len;
140 struct inode *inode = NULL;
141 struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info;
142 struct squashfs_dir_header dirh;
143 struct squashfs_dir_entry *dire;
144 u64 block = squashfs_i(dir)->start + msblk->directory_table;
145 int offset = squashfs_i(dir)->offset;
146 int err, length = 0, dir_count, size;
147
148 TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
149
150 dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
151 if (dire == NULL) {
152 ERROR("Failed to allocate squashfs_dir_entry\n");
153 return ERR_PTR(-ENOMEM);
154 }
155
156 if (len > SQUASHFS_NAME_LEN) {
157 err = -ENAMETOOLONG;
158 goto failed;
159 }
160
161 length = get_dir_index_using_name(dir->i_sb, &block, &offset,
162 squashfs_i(dir)->dir_idx_start,
163 squashfs_i(dir)->dir_idx_offset,
164 squashfs_i(dir)->dir_idx_cnt, name, len);
165
166 while (length < i_size_read(dir)) {
167 /*
168 * Read directory header.
169 */
170 err = squashfs_read_metadata(dir->i_sb, &dirh, &block,
171 &offset, sizeof(dirh));
172 if (err < 0)
173 goto read_failure;
174
175 length += sizeof(dirh);
176
177 dir_count = le32_to_cpu(dirh.count) + 1;
178 while (dir_count--) {
179 /*
180 * Read directory entry.
181 */
182 err = squashfs_read_metadata(dir->i_sb, dire, &block,
183 &offset, sizeof(*dire));
184 if (err < 0)
185 goto read_failure;
186
187 size = le16_to_cpu(dire->size) + 1;
188
189 err = squashfs_read_metadata(dir->i_sb, dire->name,
190 &block, &offset, size);
191 if (err < 0)
192 goto read_failure;
193
194 length += sizeof(*dire) + size;
195
196 if (name[0] < dire->name[0])
197 goto exit_lookup;
198
199 if (len == size && !strncmp(name, dire->name, len)) {
200 unsigned int blk, off, ino_num;
201 long long ino;
202 blk = le32_to_cpu(dirh.start_block);
203 off = le16_to_cpu(dire->offset);
204 ino_num = le32_to_cpu(dirh.inode_number) +
205 (short) le16_to_cpu(dire->inode_number);
206 ino = SQUASHFS_MKINODE(blk, off);
207
208 TRACE("calling squashfs_iget for directory "
209 "entry %s, inode %x:%x, %d\n", name,
210 blk, off, ino_num);
211
212 inode = squashfs_iget(dir->i_sb, ino, ino_num);
213 if (IS_ERR(inode)) {
214 err = PTR_ERR(inode);
215 goto failed;
216 }
217
218 goto exit_lookup;
219 }
220 }
221 }
222
223exit_lookup:
224 kfree(dire);
225 if (inode)
226 return d_splice_alias(inode, dentry);
227 d_add(dentry, inode);
228 return ERR_PTR(0);
229
230read_failure:
231 ERROR("Unable to read directory block [%llx:%x]\n",
232 squashfs_i(dir)->start + msblk->directory_table,
233 squashfs_i(dir)->offset);
234failed:
235 kfree(dire);
236 return ERR_PTR(err);
237}
238
239
240const struct inode_operations squashfs_dir_inode_ops = {
241 .lookup = squashfs_lookup
242};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
new file mode 100644
index 000000000000..6b2515d027d5
--- /dev/null
+++ b/fs/squashfs/squashfs.h
@@ -0,0 +1,90 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * squashfs.h
22 */
23
/*
 * Logging helpers.  Each one prepends a fixed SQUASHFS tag to the format
 * string and forwards to the matching pr_*() routine.
 */
#define TRACE(s, args...)	pr_debug("SQUASHFS: " s, ## args)

#define ERROR(s, args...)	pr_err("SQUASHFS error: " s, ## args)

#define WARNING(s, args...)	pr_warning("SQUASHFS: " s, ## args)
29
30static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
31{
32 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
33}
34
35/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int);
38
39/* cache.c */
40extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
41extern void squashfs_cache_delete(struct squashfs_cache *);
42extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *,
43 struct squashfs_cache *, u64, int);
44extern void squashfs_cache_put(struct squashfs_cache_entry *);
45extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int);
46extern int squashfs_read_metadata(struct super_block *, void *, u64 *,
47 int *, int);
48extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
49 u64, int);
50extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
51 u64, int);
52extern int squashfs_read_table(struct super_block *, void *, u64, int);
53
54/* export.c */
55extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
56 unsigned int);
57
58/* fragment.c */
59extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
60extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
61 u64, unsigned int);
62
63/* id.c */
64extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
65extern __le64 *squashfs_read_id_index_table(struct super_block *, u64,
66 unsigned short);
67
68/* inode.c */
69extern struct inode *squashfs_iget(struct super_block *, long long,
70 unsigned int);
71extern int squashfs_read_inode(struct inode *, long long);
72
/*
 * Inode, file and address-space operation tables, grouped by the file
 * that defines them.
 */

/* dir.c */
extern const struct file_operations		squashfs_dir_ops;

/* export.c */
extern const struct export_operations		squashfs_export_ops;

/* file.c */
extern const struct address_space_operations	squashfs_aops;

/* namei.c */
extern const struct inode_operations		squashfs_dir_inode_ops;

/* symlink.c */
extern const struct address_space_operations	squashfs_symlink_aops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
new file mode 100644
index 000000000000..283daafc568e
--- /dev/null
+++ b/fs/squashfs/squashfs_fs.h
@@ -0,0 +1,380 @@
1#ifndef SQUASHFS_FS
2#define SQUASHFS_FS
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs.h
24 */
25
/* Number of fragment cache entries, taken from the kernel configuration */
#define SQUASHFS_CACHED_FRAGMENTS	CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE

/* Filesystem layout version handled by this code */
#define SQUASHFS_MAJOR			4
#define SQUASHFS_MINOR			0

/* Byte offset at which the superblock is read */
#define SQUASHFS_START			0

/* size of metadata (inode and directory) blocks, and its log2 */
#define SQUASHFS_METADATA_SIZE		8192
#define SQUASHFS_METADATA_LOG		13

/* default size of data blocks, and its log2 */
#define SQUASHFS_FILE_SIZE		131072
#define SQUASHFS_FILE_LOG		17

/* largest accepted data block size, and its log2 */
#define SQUASHFS_FILE_MAX_SIZE		1048576
#define SQUASHFS_FILE_MAX_LOG		20

/* Max number of uids and gids */
#define SQUASHFS_IDS			65536

/* Max length of filename (not 255) */
#define SQUASHFS_NAME_LEN		256

/* Markers meaning "no fragment" / "no block" */
#define SQUASHFS_INVALID_FRAG		(0xffffffffU)
#define SQUASHFS_INVALID_BLK		(-1LL)
50
/* Filesystem flags: bit positions within the superblock flags field */
#define SQUASHFS_NOI			0
#define SQUASHFS_NOD			1
#define SQUASHFS_NOF			3
#define SQUASHFS_NO_FRAG		4
#define SQUASHFS_ALWAYS_FRAG		5
#define SQUASHFS_DUPLICATE		6
#define SQUASHFS_EXPORT			7

/*
 * Extract a single bit from a flags word.  Both arguments are
 * parenthesised so that expressions such as SQUASHFS_BIT(a | b, n)
 * are evaluated as the caller intends.
 */
#define SQUASHFS_BIT(flag, bit)		(((flag) >> (bit)) & 1)

#define SQUASHFS_UNCOMPRESSED_INODES(flags)	SQUASHFS_BIT(flags, \
						SQUASHFS_NOI)

#define SQUASHFS_UNCOMPRESSED_DATA(flags)	SQUASHFS_BIT(flags, \
						SQUASHFS_NOD)

#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags)	SQUASHFS_BIT(flags, \
						SQUASHFS_NOF)

#define SQUASHFS_NO_FRAGMENTS(flags)		SQUASHFS_BIT(flags, \
						SQUASHFS_NO_FRAG)

#define SQUASHFS_ALWAYS_FRAGMENTS(flags)	SQUASHFS_BIT(flags, \
						SQUASHFS_ALWAYS_FRAG)

#define SQUASHFS_DUPLICATES(flags)		SQUASHFS_BIT(flags, \
						SQUASHFS_DUPLICATE)

#define SQUASHFS_EXPORTABLE(flags)		SQUASHFS_BIT(flags, \
						SQUASHFS_EXPORT)
82
/*
 * Inode type codes.  Values 1-7 are the basic types; values 8-14 are the
 * corresponding "L" variants in the same order.
 */
#define SQUASHFS_DIR_TYPE		1
#define SQUASHFS_REG_TYPE		2
#define SQUASHFS_SYMLINK_TYPE		3
#define SQUASHFS_BLKDEV_TYPE		4
#define SQUASHFS_CHRDEV_TYPE		5
#define SQUASHFS_FIFO_TYPE		6
#define SQUASHFS_SOCKET_TYPE		7

#define SQUASHFS_LDIR_TYPE		8
#define SQUASHFS_LREG_TYPE		9
#define SQUASHFS_LSYMLINK_TYPE		10
#define SQUASHFS_LBLKDEV_TYPE		11
#define SQUASHFS_LCHRDEV_TYPE		12
#define SQUASHFS_LFIFO_TYPE		13
#define SQUASHFS_LSOCKET_TYPE		14
98
/*
 * 16-bit block length word: bit 15 is set when the block is stored
 * uncompressed, the remaining bits hold the size.  A size field of zero
 * is decoded as SQUASHFS_COMPRESSED_BIT (32768) bytes.
 */
#define SQUASHFS_COMPRESSED_BIT		(1 << 15)

#define SQUASHFS_COMPRESSED_SIZE(B)	(((B) & ~SQUASHFS_COMPRESSED_BIT) ? \
		(B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT)

#define SQUASHFS_COMPRESSED(B)		(!((B) & SQUASHFS_COMPRESSED_BIT))

/*
 * Wider block length word: bit 24 is set when the block is stored
 * uncompressed, the remaining bits hold the size.
 */
#define SQUASHFS_COMPRESSED_BIT_BLOCK	(1 << 24)

#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B)	((B) & \
		~SQUASHFS_COMPRESSED_BIT_BLOCK)

#define SQUASHFS_COMPRESSED_BLOCK(B)	\
		(!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK))

/*
 * Inode number ops.  Inodes consist of a compressed block number, and an
 * uncompressed offset within that block: the offset occupies the low
 * 16 bits and the block number the bits above it.
 */
#define SQUASHFS_INODE_BLK(A)		((unsigned int) ((A) >> 16))

#define SQUASHFS_INODE_OFFSET(A)	((unsigned int) ((A) & 0xffff))

#define SQUASHFS_MKINODE(A, B)		((long long)(((long long) (A)\
					<< 16) + (B)))

/* Translate between VFS mode and squashfs mode (keeps the low 12 bits) */
#define SQUASHFS_MODE(A)		((A) & 0xfff)
128
/*
 * Table geometry helpers.  The three groups below follow one pattern for
 * entry number / entry count A:
 *   *_BYTES         - bytes occupied by A entries
 *   *_INDEX/_BLOCK  - which metadata block holds entry A
 *   *_OFFSET        - byte offset of entry A inside that block
 *   *_INDEXES/_BLOCKS - metadata blocks needed for A entries (rounded up)
 *   *_INDEX_BYTES/_BLOCK_BYTES - bytes of u64 index needed for those blocks
 */

/* fragment and fragment table defines */
#define SQUASHFS_FRAGMENT_BYTES(A)	\
			((A) * sizeof(struct squashfs_fragment_entry))

#define SQUASHFS_FRAGMENT_INDEX(A)	\
			(SQUASHFS_FRAGMENT_BYTES(A) / SQUASHFS_METADATA_SIZE)

#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A)	\
			(SQUASHFS_FRAGMENT_BYTES(A) % SQUASHFS_METADATA_SIZE)

#define SQUASHFS_FRAGMENT_INDEXES(A)	\
			((SQUASHFS_FRAGMENT_BYTES(A) + \
			SQUASHFS_METADATA_SIZE - 1) / \
			SQUASHFS_METADATA_SIZE)

#define SQUASHFS_FRAGMENT_INDEX_BYTES(A)	\
			(SQUASHFS_FRAGMENT_INDEXES(A) * sizeof(u64))

/* inode lookup table defines */
#define SQUASHFS_LOOKUP_BYTES(A)	((A) * sizeof(u64))

#define SQUASHFS_LOOKUP_BLOCK(A)	\
			(SQUASHFS_LOOKUP_BYTES(A) / SQUASHFS_METADATA_SIZE)

#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A)	\
			(SQUASHFS_LOOKUP_BYTES(A) % SQUASHFS_METADATA_SIZE)

#define SQUASHFS_LOOKUP_BLOCKS(A)	\
			((SQUASHFS_LOOKUP_BYTES(A) + \
			SQUASHFS_METADATA_SIZE - 1) / \
			SQUASHFS_METADATA_SIZE)

#define SQUASHFS_LOOKUP_BLOCK_BYTES(A)	\
			(SQUASHFS_LOOKUP_BLOCKS(A) * sizeof(u64))

/* uid/gid lookup table defines */
#define SQUASHFS_ID_BYTES(A)		((A) * sizeof(unsigned int))

#define SQUASHFS_ID_BLOCK(A)		\
			(SQUASHFS_ID_BYTES(A) / SQUASHFS_METADATA_SIZE)

#define SQUASHFS_ID_BLOCK_OFFSET(A)	\
			(SQUASHFS_ID_BYTES(A) % SQUASHFS_METADATA_SIZE)

#define SQUASHFS_ID_BLOCKS(A)		\
			((SQUASHFS_ID_BYTES(A) + \
			SQUASHFS_METADATA_SIZE - 1) / \
			SQUASHFS_METADATA_SIZE)

#define SQUASHFS_ID_BLOCK_BYTES(A)	\
			(SQUASHFS_ID_BLOCKS(A) * sizeof(u64))
177
/* cached data constants for filesystem */
#define SQUASHFS_CACHED_BLKS		8

#define SQUASHFS_MAX_FILE_SIZE_LOG	64

/* evaluates to 1LL << 62 */
#define SQUASHFS_MAX_FILE_SIZE		\
			(1LL << (SQUASHFS_MAX_FILE_SIZE_LOG - 2))

#define SQUASHFS_MARKER_BYTE		0xff

/* meta index cache */
#define SQUASHFS_META_INDEXES	\
			(SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
#define SQUASHFS_META_ENTRIES		127
#define SQUASHFS_META_SLOTS		8
192
193struct meta_entry {
194 u64 data_block;
195 unsigned int index_block;
196 unsigned short offset;
197 unsigned short pad;
198};
199
200struct meta_index {
201 unsigned int inode_number;
202 unsigned int offset;
203 unsigned short entries;
204 unsigned short skip;
205 unsigned short locked;
206 unsigned short pad;
207 struct meta_entry meta_entry[SQUASHFS_META_ENTRIES];
208};
209
210
211/*
212 * definitions for structures on disk
213 */
214#define ZLIB_COMPRESSION 1
215
216struct squashfs_super_block {
217 __le32 s_magic;
218 __le32 inodes;
219 __le32 mkfs_time;
220 __le32 block_size;
221 __le32 fragments;
222 __le16 compression;
223 __le16 block_log;
224 __le16 flags;
225 __le16 no_ids;
226 __le16 s_major;
227 __le16 s_minor;
228 __le64 root_inode;
229 __le64 bytes_used;
230 __le64 id_table_start;
231 __le64 xattr_table_start;
232 __le64 inode_table_start;
233 __le64 directory_table_start;
234 __le64 fragment_table_start;
235 __le64 lookup_table_start;
236};
237
238struct squashfs_dir_index {
239 __le32 index;
240 __le32 start_block;
241 __le32 size;
242 unsigned char name[0];
243};
244
245struct squashfs_base_inode {
246 __le16 inode_type;
247 __le16 mode;
248 __le16 uid;
249 __le16 guid;
250 __le32 mtime;
251 __le32 inode_number;
252};
253
254struct squashfs_ipc_inode {
255 __le16 inode_type;
256 __le16 mode;
257 __le16 uid;
258 __le16 guid;
259 __le32 mtime;
260 __le32 inode_number;
261 __le32 nlink;
262};
263
264struct squashfs_dev_inode {
265 __le16 inode_type;
266 __le16 mode;
267 __le16 uid;
268 __le16 guid;
269 __le32 mtime;
270 __le32 inode_number;
271 __le32 nlink;
272 __le32 rdev;
273};
274
275struct squashfs_symlink_inode {
276 __le16 inode_type;
277 __le16 mode;
278 __le16 uid;
279 __le16 guid;
280 __le32 mtime;
281 __le32 inode_number;
282 __le32 nlink;
283 __le32 symlink_size;
284 char symlink[0];
285};
286
287struct squashfs_reg_inode {
288 __le16 inode_type;
289 __le16 mode;
290 __le16 uid;
291 __le16 guid;
292 __le32 mtime;
293 __le32 inode_number;
294 __le32 start_block;
295 __le32 fragment;
296 __le32 offset;
297 __le32 file_size;
298 __le16 block_list[0];
299};
300
301struct squashfs_lreg_inode {
302 __le16 inode_type;
303 __le16 mode;
304 __le16 uid;
305 __le16 guid;
306 __le32 mtime;
307 __le32 inode_number;
308 __le64 start_block;
309 __le64 file_size;
310 __le64 sparse;
311 __le32 nlink;
312 __le32 fragment;
313 __le32 offset;
314 __le32 xattr;
315 __le16 block_list[0];
316};
317
318struct squashfs_dir_inode {
319 __le16 inode_type;
320 __le16 mode;
321 __le16 uid;
322 __le16 guid;
323 __le32 mtime;
324 __le32 inode_number;
325 __le32 start_block;
326 __le32 nlink;
327 __le16 file_size;
328 __le16 offset;
329 __le32 parent_inode;
330};
331
332struct squashfs_ldir_inode {
333 __le16 inode_type;
334 __le16 mode;
335 __le16 uid;
336 __le16 guid;
337 __le32 mtime;
338 __le32 inode_number;
339 __le32 nlink;
340 __le32 file_size;
341 __le32 start_block;
342 __le32 parent_inode;
343 __le16 i_count;
344 __le16 offset;
345 __le32 xattr;
346 struct squashfs_dir_index index[0];
347};
348
349union squashfs_inode {
350 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev;
352 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc;
358};
359
360struct squashfs_dir_entry {
361 __le16 offset;
362 __le16 inode_number;
363 __le16 type;
364 __le16 size;
365 char name[0];
366};
367
368struct squashfs_dir_header {
369 __le32 count;
370 __le32 start_block;
371 __le32 inode_number;
372};
373
374struct squashfs_fragment_entry {
375 __le64 start_block;
376 __le32 size;
377 unsigned int unused;
378};
379
380#endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
new file mode 100644
index 000000000000..fbfca30c0c68
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -0,0 +1,45 @@
1#ifndef SQUASHFS_FS_I
2#define SQUASHFS_FS_I
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs_i.h
24 */
25
26struct squashfs_inode_info {
27 u64 start;
28 int offset;
29 union {
30 struct {
31 u64 fragment_block;
32 int fragment_size;
33 int fragment_offset;
34 u64 block_list_start;
35 };
36 struct {
37 u64 dir_idx_start;
38 int dir_idx_offset;
39 int dir_idx_cnt;
40 int parent;
41 };
42 };
43 struct inode vfs_inode;
44};
45#endif
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
new file mode 100644
index 000000000000..c8c65614dd1c
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -0,0 +1,76 @@
1#ifndef SQUASHFS_FS_SB
2#define SQUASHFS_FS_SB
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs_sb.h
24 */
25
26#include "squashfs_fs.h"
27
28struct squashfs_cache {
29 char *name;
30 int entries;
31 int next_blk;
32 int num_waiters;
33 int unused;
34 int block_size;
35 int pages;
36 spinlock_t lock;
37 wait_queue_head_t wait_queue;
38 struct squashfs_cache_entry *entry;
39};
40
41struct squashfs_cache_entry {
42 u64 block;
43 int length;
44 int refcount;
45 u64 next_index;
46 int pending;
47 int error;
48 int num_waiters;
49 wait_queue_head_t wait_queue;
50 struct squashfs_cache *cache;
51 void **data;
52};
53
54struct squashfs_sb_info {
55 int devblksize;
56 int devblksize_log2;
57 struct squashfs_cache *block_cache;
58 struct squashfs_cache *fragment_cache;
59 struct squashfs_cache *read_page;
60 int next_meta_index;
61 __le64 *id_table;
62 __le64 *fragment_index;
63 unsigned int *fragment_index_2;
64 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex;
66 struct meta_index *meta_index;
67 z_stream stream;
68 __le64 *inode_lookup_table;
69 u64 inode_table;
70 u64 directory_table;
71 unsigned int block_size;
72 unsigned short block_log;
73 long long bytes_used;
74 unsigned int inodes;
75};
76#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
new file mode 100644
index 000000000000..071df5b5b491
--- /dev/null
+++ b/fs/squashfs/super.c
@@ -0,0 +1,441 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * super.c
22 */
23
24/*
25 * This file implements code to read the superblock, read and initialise
26 * in-memory structures at mount time, and all the VFS glue code to register
27 * the filesystem.
28 */
29
30#include <linux/fs.h>
31#include <linux/vfs.h>
32#include <linux/slab.h>
33#include <linux/mutex.h>
34#include <linux/pagemap.h>
35#include <linux/init.h>
36#include <linux/module.h>
37#include <linux/zlib.h>
38#include <linux/magic.h>
39
40#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h"
43#include "squashfs.h"
44
45static struct file_system_type squashfs_fs_type;
46static struct super_operations squashfs_super_ops;
47
48static int supported_squashfs_filesystem(short major, short minor, short comp)
49{
50 if (major < SQUASHFS_MAJOR) {
51 ERROR("Major/Minor mismatch, older Squashfs %d.%d "
52 "filesystems are unsupported\n", major, minor);
53 return -EINVAL;
54 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
55 ERROR("Major/Minor mismatch, trying to mount newer "
56 "%d.%d filesystem\n", major, minor);
57 ERROR("Please update your kernel\n");
58 return -EINVAL;
59 }
60
61 if (comp != ZLIB_COMPRESSION)
62 return -EINVAL;
63
64 return 0;
65}
66
67
68static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
69{
70 struct squashfs_sb_info *msblk;
71 struct squashfs_super_block *sblk = NULL;
72 char b[BDEVNAME_SIZE];
73 struct inode *root;
74 long long root_inode;
75 unsigned short flags;
76 unsigned int fragments;
77 u64 lookup_table_start;
78 int err;
79
80 TRACE("Entered squashfs_fill_superblock\n");
81
82 sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
83 if (sb->s_fs_info == NULL) {
84 ERROR("Failed to allocate squashfs_sb_info\n");
85 return -ENOMEM;
86 }
87 msblk = sb->s_fs_info;
88
89 msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
90 GFP_KERNEL);
91 if (msblk->stream.workspace == NULL) {
92 ERROR("Failed to allocate zlib workspace\n");
93 goto failure;
94 }
95
96 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
97 if (sblk == NULL) {
98 ERROR("Failed to allocate squashfs_super_block\n");
99 goto failure;
100 }
101
102 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
103 msblk->devblksize_log2 = ffz(~msblk->devblksize);
104
105 mutex_init(&msblk->read_data_mutex);
106 mutex_init(&msblk->meta_index_mutex);
107
108 /*
109 * msblk->bytes_used is checked in squashfs_read_table to ensure reads
110 * are not beyond filesystem end. But as we're using
111 * squashfs_read_table here to read the superblock (including the value
112 * of bytes_used) we need to set it to an initial sensible dummy value
113 */
114 msblk->bytes_used = sizeof(*sblk);
115 err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk));
116
117 if (err < 0) {
118 ERROR("unable to read squashfs_super_block\n");
119 goto failed_mount;
120 }
121
122 /* Check it is a SQUASHFS superblock */
123 sb->s_magic = le32_to_cpu(sblk->s_magic);
124 if (sb->s_magic != SQUASHFS_MAGIC) {
125 if (!silent)
126 ERROR("Can't find a SQUASHFS superblock on %s\n",
127 bdevname(sb->s_bdev, b));
128 err = -EINVAL;
129 goto failed_mount;
130 }
131
132 /* Check the MAJOR & MINOR versions and compression type */
133 err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
134 le16_to_cpu(sblk->s_minor),
135 le16_to_cpu(sblk->compression));
136 if (err < 0)
137 goto failed_mount;
138
139 err = -EINVAL;
140
141 /*
142 * Check if there's xattrs in the filesystem. These are not
143 * supported in this version, so warn that they will be ignored.
144 */
145 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
146 ERROR("Xattrs in filesystem, these will be ignored\n");
147
148 /* Check the filesystem does not extend beyond the end of the
149 block device */
150 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
151 if (msblk->bytes_used < 0 || msblk->bytes_used >
152 i_size_read(sb->s_bdev->bd_inode))
153 goto failed_mount;
154
155 /* Check block size for sanity */
156 msblk->block_size = le32_to_cpu(sblk->block_size);
157 if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
158 goto failed_mount;
159
160 msblk->block_log = le16_to_cpu(sblk->block_log);
161 if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
162 goto failed_mount;
163
164 /* Check the root inode for sanity */
165 root_inode = le64_to_cpu(sblk->root_inode);
166 if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
167 goto failed_mount;
168
169 msblk->inode_table = le64_to_cpu(sblk->inode_table_start);
170 msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
171 msblk->inodes = le32_to_cpu(sblk->inodes);
172 flags = le16_to_cpu(sblk->flags);
173
174 TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
175 TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
176 ? "un" : "");
177 TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
178 ? "un" : "");
179 TRACE("Filesystem size %lld bytes\n", msblk->bytes_used);
180 TRACE("Block size %d\n", msblk->block_size);
181 TRACE("Number of inodes %d\n", msblk->inodes);
182 TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments));
183 TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
184 TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
185 TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
186 TRACE("sblk->fragment_table_start %llx\n",
187 (u64) le64_to_cpu(sblk->fragment_table_start));
188 TRACE("sblk->id_table_start %llx\n",
189 (u64) le64_to_cpu(sblk->id_table_start));
190
191 sb->s_maxbytes = MAX_LFS_FILESIZE;
192 sb->s_flags |= MS_RDONLY;
193 sb->s_op = &squashfs_super_ops;
194
195 err = -ENOMEM;
196
197 msblk->block_cache = squashfs_cache_init("metadata",
198 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
199 if (msblk->block_cache == NULL)
200 goto failed_mount;
201
202 /* Allocate read_page block */
203 msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
204 if (msblk->read_page == NULL) {
205 ERROR("Failed to allocate read_page block\n");
206 goto failed_mount;
207 }
208
209 /* Allocate and read id index table */
210 msblk->id_table = squashfs_read_id_index_table(sb,
211 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
212 if (IS_ERR(msblk->id_table)) {
213 err = PTR_ERR(msblk->id_table);
214 msblk->id_table = NULL;
215 goto failed_mount;
216 }
217
218 fragments = le32_to_cpu(sblk->fragments);
219 if (fragments == 0)
220 goto allocate_lookup_table;
221
222 msblk->fragment_cache = squashfs_cache_init("fragment",
223 SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
224 if (msblk->fragment_cache == NULL) {
225 err = -ENOMEM;
226 goto failed_mount;
227 }
228
229 /* Allocate and read fragment index table */
230 msblk->fragment_index = squashfs_read_fragment_index_table(sb,
231 le64_to_cpu(sblk->fragment_table_start), fragments);
232 if (IS_ERR(msblk->fragment_index)) {
233 err = PTR_ERR(msblk->fragment_index);
234 msblk->fragment_index = NULL;
235 goto failed_mount;
236 }
237
238allocate_lookup_table:
239 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
240 if (lookup_table_start == SQUASHFS_INVALID_BLK)
241 goto allocate_root;
242
243 /* Allocate and read inode lookup table */
244 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
245 lookup_table_start, msblk->inodes);
246 if (IS_ERR(msblk->inode_lookup_table)) {
247 err = PTR_ERR(msblk->inode_lookup_table);
248 msblk->inode_lookup_table = NULL;
249 goto failed_mount;
250 }
251
252 sb->s_export_op = &squashfs_export_ops;
253
254allocate_root:
255 root = new_inode(sb);
256 if (!root) {
257 err = -ENOMEM;
258 goto failed_mount;
259 }
260
261 err = squashfs_read_inode(root, root_inode);
262 if (err) {
263 iget_failed(root);
264 goto failed_mount;
265 }
266 insert_inode_hash(root);
267
268 sb->s_root = d_alloc_root(root);
269 if (sb->s_root == NULL) {
270 ERROR("Root inode create failed\n");
271 err = -ENOMEM;
272 iput(root);
273 goto failed_mount;
274 }
275
276 TRACE("Leaving squashfs_fill_super\n");
277 kfree(sblk);
278 return 0;
279
280failed_mount:
281 squashfs_cache_delete(msblk->block_cache);
282 squashfs_cache_delete(msblk->fragment_cache);
283 squashfs_cache_delete(msblk->read_page);
284 kfree(msblk->inode_lookup_table);
285 kfree(msblk->fragment_index);
286 kfree(msblk->id_table);
287 kfree(msblk->stream.workspace);
288 kfree(sb->s_fs_info);
289 sb->s_fs_info = NULL;
290 kfree(sblk);
291 return err;
292
293failure:
294 kfree(msblk->stream.workspace);
295 kfree(sb->s_fs_info);
296 sb->s_fs_info = NULL;
297 return -ENOMEM;
298}
299
300
301static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
302{
303 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
304
305 TRACE("Entered squashfs_statfs\n");
306
307 buf->f_type = SQUASHFS_MAGIC;
308 buf->f_bsize = msblk->block_size;
309 buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1;
310 buf->f_bfree = buf->f_bavail = 0;
311 buf->f_files = msblk->inodes;
312 buf->f_ffree = 0;
313 buf->f_namelen = SQUASHFS_NAME_LEN;
314
315 return 0;
316}
317
318
319static int squashfs_remount(struct super_block *sb, int *flags, char *data)
320{
321 *flags |= MS_RDONLY;
322 return 0;
323}
324
325
326static void squashfs_put_super(struct super_block *sb)
327{
328 if (sb->s_fs_info) {
329 struct squashfs_sb_info *sbi = sb->s_fs_info;
330 squashfs_cache_delete(sbi->block_cache);
331 squashfs_cache_delete(sbi->fragment_cache);
332 squashfs_cache_delete(sbi->read_page);
333 kfree(sbi->id_table);
334 kfree(sbi->fragment_index);
335 kfree(sbi->meta_index);
336 kfree(sbi->stream.workspace);
337 kfree(sb->s_fs_info);
338 sb->s_fs_info = NULL;
339 }
340}
341
342
343static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
344 const char *dev_name, void *data,
345 struct vfsmount *mnt)
346{
347 return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
348 mnt);
349}
350
351
352static struct kmem_cache *squashfs_inode_cachep;
353
354
355static void init_once(void *foo)
356{
357 struct squashfs_inode_info *ei = foo;
358
359 inode_init_once(&ei->vfs_inode);
360}
361
362
363static int __init init_inodecache(void)
364{
365 squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
366 sizeof(struct squashfs_inode_info), 0,
367 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
368
369 return squashfs_inode_cachep ? 0 : -ENOMEM;
370}
371
372
373static void destroy_inodecache(void)
374{
375 kmem_cache_destroy(squashfs_inode_cachep);
376}
377
378
379static int __init init_squashfs_fs(void)
380{
381 int err = init_inodecache();
382
383 if (err)
384 return err;
385
386 err = register_filesystem(&squashfs_fs_type);
387 if (err) {
388 destroy_inodecache();
389 return err;
390 }
391
392 printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
393 "Phillip Lougher\n");
394
395 return 0;
396}
397
398
399static void __exit exit_squashfs_fs(void)
400{
401 unregister_filesystem(&squashfs_fs_type);
402 destroy_inodecache();
403}
404
405
406static struct inode *squashfs_alloc_inode(struct super_block *sb)
407{
408 struct squashfs_inode_info *ei =
409 kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
410
411 return ei ? &ei->vfs_inode : NULL;
412}
413
414
415static void squashfs_destroy_inode(struct inode *inode)
416{
417 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
418}
419
420
421static struct file_system_type squashfs_fs_type = {
422 .owner = THIS_MODULE,
423 .name = "squashfs",
424 .get_sb = squashfs_get_sb,
425 .kill_sb = kill_block_super,
426 .fs_flags = FS_REQUIRES_DEV
427};
428
429static struct super_operations squashfs_super_ops = {
430 .alloc_inode = squashfs_alloc_inode,
431 .destroy_inode = squashfs_destroy_inode,
432 .statfs = squashfs_statfs,
433 .put_super = squashfs_put_super,
434 .remount_fs = squashfs_remount
435};
436
437module_init(init_squashfs_fs);
438module_exit(exit_squashfs_fs);
439MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
440MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>");
441MODULE_LICENSE("GPL");
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
new file mode 100644
index 000000000000..83d87880aac8
--- /dev/null
+++ b/fs/squashfs/symlink.c
@@ -0,0 +1,118 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * symlink.c
22 */
23
24/*
25 * This file implements code to handle symbolic links.
26 *
27 * The data contents of symbolic links are stored inside the symbolic
28 * link inode within the inode table. This allows the normally small symbolic
29 * link to be compressed as part of the inode table, achieving much greater
30 * compression than if the symbolic link was compressed individually.
31 */
32
33#include <linux/fs.h>
34#include <linux/vfs.h>
35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/string.h>
38#include <linux/pagemap.h>
39#include <linux/zlib.h>
40
41#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h"
44#include "squashfs.h"
45
46static int squashfs_symlink_readpage(struct file *file, struct page *page)
47{
48 struct inode *inode = page->mapping->host;
49 struct super_block *sb = inode->i_sb;
50 struct squashfs_sb_info *msblk = sb->s_fs_info;
51 int index = page->index << PAGE_CACHE_SHIFT;
52 u64 block = squashfs_i(inode)->start;
53 int offset = squashfs_i(inode)->offset;
54 int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
55 int bytes, copied;
56 void *pageaddr;
57 struct squashfs_cache_entry *entry;
58
59 TRACE("Entered squashfs_symlink_readpage, page index %ld, start block "
60 "%llx, offset %x\n", page->index, block, offset);
61
62 /*
63 * Skip index bytes into symlink metadata.
64 */
65 if (index) {
66 bytes = squashfs_read_metadata(sb, NULL, &block, &offset,
67 index);
68 if (bytes < 0) {
69 ERROR("Unable to read symlink [%llx:%x]\n",
70 squashfs_i(inode)->start,
71 squashfs_i(inode)->offset);
72 goto error_out;
73 }
74 }
75
76 /*
77 * Read length bytes from symlink metadata. Squashfs_read_metadata
78 * is not used here because it can sleep and we want to use
79 * kmap_atomic to map the page. Instead call the underlying
80 * squashfs_cache_get routine. As length bytes may overlap metadata
81 * blocks, we may need to call squashfs_cache_get multiple times.
82 */
83 for (bytes = 0; bytes < length; offset = 0, bytes += copied) {
84 entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
85 if (entry->error) {
86 ERROR("Unable to read symlink [%llx:%x]\n",
87 squashfs_i(inode)->start,
88 squashfs_i(inode)->offset);
89 squashfs_cache_put(entry);
90 goto error_out;
91 }
92
93 pageaddr = kmap_atomic(page, KM_USER0);
94 copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
95 length - bytes);
96 if (copied == length - bytes)
97 memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
98 else
99 block = entry->next_index;
100 kunmap_atomic(pageaddr, KM_USER0);
101 squashfs_cache_put(entry);
102 }
103
104 flush_dcache_page(page);
105 SetPageUptodate(page);
106 unlock_page(page);
107 return 0;
108
109error_out:
110 SetPageError(page);
111 unlock_page(page);
112 return 0;
113}
114
115
116const struct address_space_operations squashfs_symlink_aops = {
117 .readpage = squashfs_symlink_readpage
118};
diff --git a/fs/stat.c b/fs/stat.c
index 7c46fbeb8b76..2db740a0cfb5 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -152,7 +152,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
152 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 152 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
153} 153}
154 154
155asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user * statbuf) 155SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
156{ 156{
157 struct kstat stat; 157 struct kstat stat;
158 int error = vfs_stat_fd(AT_FDCWD, filename, &stat); 158 int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
@@ -162,7 +162,8 @@ asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user
162 162
163 return error; 163 return error;
164} 164}
165asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __user * statbuf) 165
166SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
166{ 167{
167 struct kstat stat; 168 struct kstat stat;
168 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); 169 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
@@ -172,7 +173,8 @@ asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __use
172 173
173 return error; 174 return error;
174} 175}
175asmlinkage long sys_fstat(unsigned int fd, struct __old_kernel_stat __user * statbuf) 176
177SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
176{ 178{
177 struct kstat stat; 179 struct kstat stat;
178 int error = vfs_fstat(fd, &stat); 180 int error = vfs_fstat(fd, &stat);
@@ -235,7 +237,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
235 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 237 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
236} 238}
237 239
238asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf) 240SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
239{ 241{
240 struct kstat stat; 242 struct kstat stat;
241 int error = vfs_stat_fd(AT_FDCWD, filename, &stat); 243 int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
@@ -246,7 +248,7 @@ asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
246 return error; 248 return error;
247} 249}
248 250
249asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf) 251SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
250{ 252{
251 struct kstat stat; 253 struct kstat stat;
252 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); 254 int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
@@ -258,8 +260,8 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
258} 260}
259 261
260#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) 262#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
261asmlinkage long sys_newfstatat(int dfd, char __user *filename, 263SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
262 struct stat __user *statbuf, int flag) 264 struct stat __user *, statbuf, int, flag)
263{ 265{
264 struct kstat stat; 266 struct kstat stat;
265 int error = -EINVAL; 267 int error = -EINVAL;
@@ -280,7 +282,7 @@ out:
280} 282}
281#endif 283#endif
282 284
283asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf) 285SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
284{ 286{
285 struct kstat stat; 287 struct kstat stat;
286 int error = vfs_fstat(fd, &stat); 288 int error = vfs_fstat(fd, &stat);
@@ -291,8 +293,8 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
291 return error; 293 return error;
292} 294}
293 295
294asmlinkage long sys_readlinkat(int dfd, const char __user *pathname, 296SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
295 char __user *buf, int bufsiz) 297 char __user *, buf, int, bufsiz)
296{ 298{
297 struct path path; 299 struct path path;
298 int error; 300 int error;
@@ -305,7 +307,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
305 struct inode *inode = path.dentry->d_inode; 307 struct inode *inode = path.dentry->d_inode;
306 308
307 error = -EINVAL; 309 error = -EINVAL;
308 if (inode->i_op && inode->i_op->readlink) { 310 if (inode->i_op->readlink) {
309 error = security_inode_readlink(path.dentry); 311 error = security_inode_readlink(path.dentry);
310 if (!error) { 312 if (!error) {
311 touch_atime(path.mnt, path.dentry); 313 touch_atime(path.mnt, path.dentry);
@@ -318,8 +320,8 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
318 return error; 320 return error;
319} 321}
320 322
321asmlinkage long sys_readlink(const char __user *path, char __user *buf, 323SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
322 int bufsiz) 324 int, bufsiz)
323{ 325{
324 return sys_readlinkat(AT_FDCWD, path, buf, bufsiz); 326 return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
325} 327}
@@ -365,7 +367,7 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
365 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0; 367 return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
366} 368}
367 369
368asmlinkage long sys_stat64(char __user * filename, struct stat64 __user * statbuf) 370SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
369{ 371{
370 struct kstat stat; 372 struct kstat stat;
371 int error = vfs_stat(filename, &stat); 373 int error = vfs_stat(filename, &stat);
@@ -375,7 +377,8 @@ asmlinkage long sys_stat64(char __user * filename, struct stat64 __user * statbu
375 377
376 return error; 378 return error;
377} 379}
378asmlinkage long sys_lstat64(char __user * filename, struct stat64 __user * statbuf) 380
381SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf)
379{ 382{
380 struct kstat stat; 383 struct kstat stat;
381 int error = vfs_lstat(filename, &stat); 384 int error = vfs_lstat(filename, &stat);
@@ -385,7 +388,8 @@ asmlinkage long sys_lstat64(char __user * filename, struct stat64 __user * statb
385 388
386 return error; 389 return error;
387} 390}
388asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf) 391
392SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
389{ 393{
390 struct kstat stat; 394 struct kstat stat;
391 int error = vfs_fstat(fd, &stat); 395 int error = vfs_fstat(fd, &stat);
@@ -396,8 +400,8 @@ asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf)
396 return error; 400 return error;
397} 401}
398 402
399asmlinkage long sys_fstatat64(int dfd, char __user *filename, 403SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
400 struct stat64 __user *statbuf, int flag) 404 struct stat64 __user *, statbuf, int, flag)
401{ 405{
402 struct kstat stat; 406 struct kstat stat;
403 int error = -EINVAL; 407 int error = -EINVAL;
diff --git a/fs/super.c b/fs/super.c
index ddba069d7a99..645e5403f2a0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,6 +38,7 @@
38#include <linux/kobject.h> 38#include <linux/kobject.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40#include <linux/file.h> 40#include <linux/file.h>
41#include <linux/async.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include "internal.h" 43#include "internal.h"
43 44
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71 INIT_HLIST_HEAD(&s->s_anon); 72 INIT_HLIST_HEAD(&s->s_anon);
72 INIT_LIST_HEAD(&s->s_inodes); 73 INIT_LIST_HEAD(&s->s_inodes);
73 INIT_LIST_HEAD(&s->s_dentry_lru); 74 INIT_LIST_HEAD(&s->s_dentry_lru);
75 INIT_LIST_HEAD(&s->s_async_list);
74 init_rwsem(&s->s_umount); 76 init_rwsem(&s->s_umount);
75 mutex_init(&s->s_lock); 77 mutex_init(&s->s_lock);
76 lockdep_set_class(&s->s_umount, &type->s_umount_key); 78 lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb)
289{ 291{
290 const struct super_operations *sop = sb->s_op; 292 const struct super_operations *sop = sb->s_op;
291 293
294
292 if (sb->s_root) { 295 if (sb->s_root) {
293 shrink_dcache_for_umount(sb); 296 shrink_dcache_for_umount(sb);
294 fsync_super(sb); 297 fsync_super(sb);
295 lock_super(sb); 298 lock_super(sb);
296 sb->s_flags &= ~MS_ACTIVE; 299 sb->s_flags &= ~MS_ACTIVE;
300
301 /*
302 * wait for asynchronous fs operations to finish before going further
303 */
304 async_synchronize_full_special(&sb->s_async_list);
305
297 /* bad name - it should be evict_inodes() */ 306 /* bad name - it should be evict_inodes() */
298 invalidate_inodes(sb); 307 invalidate_inodes(sb);
299 lock_kernel(); 308 lock_kernel();
@@ -461,6 +470,7 @@ restart:
461 sb->s_count++; 470 sb->s_count++;
462 spin_unlock(&sb_lock); 471 spin_unlock(&sb_lock);
463 down_read(&sb->s_umount); 472 down_read(&sb->s_umount);
473 async_synchronize_full_special(&sb->s_async_list);
464 if (sb->s_root && (wait || sb->s_dirt)) 474 if (sb->s_root && (wait || sb->s_dirt))
465 sb->s_op->sync_fs(sb, wait); 475 sb->s_op->sync_fs(sb, wait);
466 up_read(&sb->s_umount); 476 up_read(&sb->s_umount);
@@ -534,7 +544,7 @@ rescan:
534 return NULL; 544 return NULL;
535} 545}
536 546
537asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf) 547SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
538{ 548{
539 struct super_block *s; 549 struct super_block *s;
540 struct ustat tmp; 550 struct ustat tmp;
@@ -800,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
800 } 810 }
801 811
802 s->s_flags |= MS_ACTIVE; 812 s->s_flags |= MS_ACTIVE;
813 bdev->bd_super = s;
803 } 814 }
804 815
805 return simple_set_mnt(mnt, s); 816 return simple_set_mnt(mnt, s);
@@ -819,6 +830,7 @@ void kill_block_super(struct super_block *sb)
819 struct block_device *bdev = sb->s_bdev; 830 struct block_device *bdev = sb->s_bdev;
820 fmode_t mode = sb->s_mode; 831 fmode_t mode = sb->s_mode;
821 832
833 bdev->bd_super = 0;
822 generic_shutdown_super(sb); 834 generic_shutdown_super(sb);
823 sync_blockdev(bdev); 835 sync_blockdev(bdev);
824 close_bdev_exclusive(bdev, mode); 836 close_bdev_exclusive(bdev, mode);
diff --git a/fs/sync.c b/fs/sync.c
index 2967562d416f..a16d53e5fe9d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -36,7 +36,7 @@ static void do_sync(unsigned long wait)
36 laptop_sync_completion(); 36 laptop_sync_completion();
37} 37}
38 38
39asmlinkage long sys_sync(void) 39SYSCALL_DEFINE0(sync)
40{ 40{
41 do_sync(1); 41 do_sync(1);
42 return 0; 42 return 0;
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
75 return ret; 75 return ret;
76} 76}
77 77
78long do_fsync(struct file *file, int datasync) 78/**
79 * vfs_fsync - perform a fsync or fdatasync on a file
80 * @file: file to sync
81 * @dentry: dentry of @file
82 * @datasync: only perform a fdatasync operation
83 *
84 * Write back data and metadata for @file to disk. If @datasync is
85 * set only metadata needed to access modified file data is written.
86 *
87 * In case this function is called from nfsd @file may be %NULL and
88 * only @dentry is set. This can only happen when the filesystem
89 * implements the export_operations API.
90 */
91int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
79{ 92{
80 int ret; 93 const struct file_operations *fop;
81 int err; 94 struct address_space *mapping;
82 struct address_space *mapping = file->f_mapping; 95 int err, ret;
83 96
84 if (!file->f_op || !file->f_op->fsync) { 97 /*
85 /* Why? We can still call filemap_fdatawrite */ 98 * Get mapping and operations from the file in case we have
99 * a file, or get the default values for them in case we
100 * don't have a struct file available. Damn nfsd..
101 */
102 if (file) {
103 mapping = file->f_mapping;
104 fop = file->f_op;
105 } else {
106 mapping = dentry->d_inode->i_mapping;
107 fop = dentry->d_inode->i_fop;
108 }
109
110 if (!fop || !fop->fsync) {
86 ret = -EINVAL; 111 ret = -EINVAL;
87 goto out; 112 goto out;
88 } 113 }
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync)
94 * livelocks in fsync_buffers_list(). 119 * livelocks in fsync_buffers_list().
95 */ 120 */
96 mutex_lock(&mapping->host->i_mutex); 121 mutex_lock(&mapping->host->i_mutex);
97 err = file->f_op->fsync(file, file->f_path.dentry, datasync); 122 err = fop->fsync(file, dentry, datasync);
98 if (!ret) 123 if (!ret)
99 ret = err; 124 ret = err;
100 mutex_unlock(&mapping->host->i_mutex); 125 mutex_unlock(&mapping->host->i_mutex);
@@ -104,28 +129,29 @@ long do_fsync(struct file *file, int datasync)
104out: 129out:
105 return ret; 130 return ret;
106} 131}
132EXPORT_SYMBOL(vfs_fsync);
107 133
108static long __do_fsync(unsigned int fd, int datasync) 134static int do_fsync(unsigned int fd, int datasync)
109{ 135{
110 struct file *file; 136 struct file *file;
111 int ret = -EBADF; 137 int ret = -EBADF;
112 138
113 file = fget(fd); 139 file = fget(fd);
114 if (file) { 140 if (file) {
115 ret = do_fsync(file, datasync); 141 ret = vfs_fsync(file, file->f_path.dentry, datasync);
116 fput(file); 142 fput(file);
117 } 143 }
118 return ret; 144 return ret;
119} 145}
120 146
121asmlinkage long sys_fsync(unsigned int fd) 147SYSCALL_DEFINE1(fsync, unsigned int, fd)
122{ 148{
123 return __do_fsync(fd, 0); 149 return do_fsync(fd, 0);
124} 150}
125 151
126asmlinkage long sys_fdatasync(unsigned int fd) 152SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
127{ 153{
128 return __do_fsync(fd, 1); 154 return do_fsync(fd, 1);
129} 155}
130 156
131/* 157/*
@@ -175,8 +201,8 @@ asmlinkage long sys_fdatasync(unsigned int fd)
175 * already-instantiated disk blocks, there are no guarantees here that the data 201 * already-instantiated disk blocks, there are no guarantees here that the data
176 * will be available after a crash. 202 * will be available after a crash.
177 */ 203 */
178asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, 204SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
179 unsigned int flags) 205 unsigned int flags)
180{ 206{
181 int ret; 207 int ret;
182 struct file *file; 208 struct file *file;
@@ -236,14 +262,32 @@ out_put:
236out: 262out:
237 return ret; 263 return ret;
238} 264}
265#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
266asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes,
267 long flags)
268{
269 return SYSC_sync_file_range((int) fd, offset, nbytes,
270 (unsigned int) flags);
271}
272SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range);
273#endif
239 274
240/* It would be nice if people remember that not all the world's an i386 275/* It would be nice if people remember that not all the world's an i386
241 when they introduce new system calls */ 276 when they introduce new system calls */
242asmlinkage long sys_sync_file_range2(int fd, unsigned int flags, 277SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags,
243 loff_t offset, loff_t nbytes) 278 loff_t offset, loff_t nbytes)
244{ 279{
245 return sys_sync_file_range(fd, offset, nbytes, flags); 280 return sys_sync_file_range(fd, offset, nbytes, flags);
246} 281}
282#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
283asmlinkage long SyS_sync_file_range2(long fd, long flags,
284 loff_t offset, loff_t nbytes)
285{
286 return SYSC_sync_file_range2((int) fd, (unsigned int) flags,
287 offset, nbytes);
288}
289SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
290#endif
247 291
248/* 292/*
249 * `endbyte' is inclusive 293 * `endbyte' is inclusive
@@ -269,7 +313,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
269 313
270 if (flags & SYNC_FILE_RANGE_WRITE) { 314 if (flags & SYNC_FILE_RANGE_WRITE) {
271 ret = __filemap_fdatawrite_range(mapping, offset, endbyte, 315 ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
272 WB_SYNC_NONE); 316 WB_SYNC_ALL);
273 if (ret < 0) 317 if (ret < 0)
274 goto out; 318 goto out;
275 } 319 }
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
new file mode 100644
index 000000000000..f4b67588b9d6
--- /dev/null
+++ b/fs/sysfs/Kconfig
@@ -0,0 +1,23 @@
1config SYSFS
2 bool "sysfs file system support" if EMBEDDED
3 default y
4 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to
6 export internal kernel objects, their attributes, and their
7 relationships to one another.
8
9 Users can use sysfs to ascertain useful information about the running
10 kernel, such as the devices the kernel has discovered on each bus and
11 which driver each is bound to. sysfs can also be used to tune devices
12 and other kernel subsystems.
13
14 Some system agents rely on the information in sysfs to operate.
15 /sbin/hotplug uses device and object attributes in sysfs to assist in
16 delegating policy decisions, like persistently naming devices.
17
18 sysfs is currently used by the block subsystem to mount the root
19 partition. If sysfs is disabled you must specify the boot device on
20 the kernel boot command line via its major and minor numbers. For
21 example, "root=03:01" for /dev/hda1.
22
23 Designers of embedded systems may wish to say N here to conserve space.
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 66f6e58a7e4b..f2c478c3424e 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -63,6 +63,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
63 int count = min_t(size_t, bytes, PAGE_SIZE); 63 int count = min_t(size_t, bytes, PAGE_SIZE);
64 char *temp; 64 char *temp;
65 65
66 if (!bytes)
67 return 0;
68
66 if (size) { 69 if (size) {
67 if (offs > size) 70 if (offs > size)
68 return 0; 71 return 0;
@@ -131,6 +134,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
131 int count = min_t(size_t, bytes, PAGE_SIZE); 134 int count = min_t(size_t, bytes, PAGE_SIZE);
132 char *temp; 135 char *temp;
133 136
137 if (!bytes)
138 return 0;
139
134 if (size) { 140 if (size) {
135 if (offs > size) 141 if (offs > size)
136 return 0; 142 return 0;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index eb53c632f856..dfa3d94cfc74 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
107static inline void set_default_inode_attr(struct inode * inode, mode_t mode) 107static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
108{ 108{
109 inode->i_mode = mode; 109 inode->i_mode = mode;
110 inode->i_uid = 0;
111 inode->i_gid = 0;
112 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 110 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
113} 111}
114 112
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
149{ 147{
150 struct bin_attribute *bin_attr; 148 struct bin_attribute *bin_attr;
151 149
152 inode->i_blocks = 0;
153 inode->i_mapping->a_ops = &sysfs_aops; 150 inode->i_mapping->a_ops = &sysfs_aops;
154 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 151 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
155 inode->i_op = &sysfs_inode_operations; 152 inode->i_op = &sysfs_inode_operations;
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
new file mode 100644
index 000000000000..33aeb4b75db1
--- /dev/null
+++ b/fs/sysv/Kconfig
@@ -0,0 +1,36 @@
1config SYSV_FS
2 tristate "System V/Xenix/V7/Coherent file system support"
3 depends on BLOCK
4 help
5 SCO, Xenix and Coherent are commercial Unix systems for Intel
6 machines, and Version 7 was used on the DEC PDP-11. Saying Y
7 here would allow you to read from their floppies and hard disk
8 partitions.
9
10 If you have floppies or hard disk partitions like that, it is likely
11 that they contain binaries from those other Unix systems; in order
12 to run these binaries, you will want to install linux-abi which is
13 a set of kernel modules that lets you run SCO, Xenix, Wyse,
14 UnixWare, Dell Unix and System V programs under Linux. It is
15 available via FTP (user: ftp) from
16 <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>.
17 NOTE: that will work only for binaries from Intel-based systems;
18 PDP ones will have to wait until somebody ports Linux to -11 ;-)
19
20 If you only intend to mount files from some other Unix over the
21 network using NFS, you don't need the System V file system support
22 (but you need NFS file system support obviously).
23
24 Note that this option is generally not needed for floppies, since a
25 good portable way to transport files and directories between unixes
26 (and even other operating systems) is given by the tar program ("man
27 tar" or preferably "info tar"). Note also that this option has
28 nothing whatsoever to do with the option "System V IPC". Read about
29 the System V file system in
30 <file:Documentation/filesystems/sysv-fs.txt>.
31 Saying Y here will enlarge your kernel by about 27 KB.
32
33 To compile this as a module, choose M here: the module will be called
34 sysv.
35
36 If you haven't heard about all of this before, it's safe to say N.
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index df0d435baa48..3d81bf58dae2 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -27,6 +27,7 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/vfs.h> 29#include <linux/vfs.h>
30#include <linux/namei.h>
30#include <asm/byteorder.h> 31#include <asm/byteorder.h>
31#include "sysv.h" 32#include "sysv.h"
32 33
@@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
163 if (inode->i_blocks) { 164 if (inode->i_blocks) {
164 inode->i_op = &sysv_symlink_inode_operations; 165 inode->i_op = &sysv_symlink_inode_operations;
165 inode->i_mapping->a_ops = &sysv_aops; 166 inode->i_mapping->a_ops = &sysv_aops;
166 } else 167 } else {
167 inode->i_op = &sysv_fast_symlink_inode_operations; 168 inode->i_op = &sysv_fast_symlink_inode_operations;
169 nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
170 sizeof(SYSV_I(inode)->i_data) - 1);
171 }
168 } else 172 } else
169 init_special_inode(inode, inode->i_mode, rdev); 173 init_special_inode(inode, inode->i_mode, rdev);
170} 174}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 0862f0e49d0c..6a123b8ff3f5 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -177,7 +177,7 @@ static struct file *timerfd_fget(int fd)
177 return file; 177 return file;
178} 178}
179 179
180asmlinkage long sys_timerfd_create(int clockid, int flags) 180SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
181{ 181{
182 int ufd; 182 int ufd;
183 struct timerfd_ctx *ctx; 183 struct timerfd_ctx *ctx;
@@ -208,9 +208,9 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
208 return ufd; 208 return ufd;
209} 209}
210 210
211asmlinkage long sys_timerfd_settime(int ufd, int flags, 211SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
212 const struct itimerspec __user *utmr, 212 const struct itimerspec __user *, utmr,
213 struct itimerspec __user *otmr) 213 struct itimerspec __user *, otmr)
214{ 214{
215 struct file *file; 215 struct file *file;
216 struct timerfd_ctx *ctx; 216 struct timerfd_ctx *ctx;
@@ -265,7 +265,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags,
265 return 0; 265 return 0;
266} 266}
267 267
268asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr) 268SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
269{ 269{
270 struct file *file; 270 struct file *file;
271 struct timerfd_ctx *ctx; 271 struct timerfd_ctx *ctx;
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 91ceeda7e5bf..e35b54d5059d 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB
40 depends on UBIFS_FS 40 depends on UBIFS_FS
41 default y 41 default y
42 help 42 help
43 Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. 43 Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
44 44
45# Debugging-related stuff 45# Debugging-related stuff
46config UBIFS_FS_DEBUG 46config UBIFS_FS_DEBUG
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 4a18f084cc42..f393620890ee 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -32,18 +32,15 @@
32 32
33#include "ubifs.h" 33#include "ubifs.h"
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <asm/div64.h> 35#include <linux/math64.h>
36 36
37/* 37/*
38 * When pessimistic budget calculations say that there is no enough space, 38 * When pessimistic budget calculations say that there is no enough space,
39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection, 39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
40 * or committing. The below constants define maximum number of times UBIFS 40 * or committing. The below constant defines maximum number of times UBIFS
41 * repeats the operations. 41 * repeats the operations.
42 */ 42 */
43#define MAX_SHRINK_RETRIES 8 43#define MAX_MKSPC_RETRIES 3
44#define MAX_GC_RETRIES 4
45#define MAX_CMT_RETRIES 2
46#define MAX_NOSPC_RETRIES 1
47 44
48/* 45/*
49 * The below constant defines amount of dirty pages which should be written 46 * The below constant defines amount of dirty pages which should be written
@@ -52,30 +49,6 @@
52#define NR_TO_WRITE 16 49#define NR_TO_WRITE 16
53 50
54/** 51/**
55 * struct retries_info - information about re-tries while making free space.
56 * @prev_liability: previous liability
57 * @shrink_cnt: how many times the liability was shrinked
58 * @shrink_retries: count of liability shrink re-tries (increased when
59 * liability does not shrink)
60 * @try_gc: GC should be tried first
61 * @gc_retries: how many times GC was run
62 * @cmt_retries: how many times commit has been done
63 * @nospc_retries: how many times GC returned %-ENOSPC
64 *
65 * Since we consider budgeting to be the fast-path, and this structure has to
66 * be allocated on stack and zeroed out, we make it smaller using bit-fields.
67 */
68struct retries_info {
69 long long prev_liability;
70 unsigned int shrink_cnt;
71 unsigned int shrink_retries:5;
72 unsigned int try_gc:1;
73 unsigned int gc_retries:4;
74 unsigned int cmt_retries:3;
75 unsigned int nospc_retries:1;
76};
77
78/**
79 * shrink_liability - write-back some dirty pages/inodes. 52 * shrink_liability - write-back some dirty pages/inodes.
80 * @c: UBIFS file-system description object 53 * @c: UBIFS file-system description object
81 * @nr_to_write: how many dirty pages to write-back 54 * @nr_to_write: how many dirty pages to write-back
@@ -147,13 +120,29 @@ static int run_gc(struct ubifs_info *c)
147} 120}
148 121
149/** 122/**
123 * get_liability - calculate current liability.
124 * @c: UBIFS file-system description object
125 *
126 * This function calculates and returns current UBIFS liability, i.e. the
127 * amount of bytes UBIFS has "promised" to write to the media.
128 */
129static long long get_liability(struct ubifs_info *c)
130{
131 long long liab;
132
133 spin_lock(&c->space_lock);
134 liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
135 spin_unlock(&c->space_lock);
136 return liab;
137}
138
139/**
150 * make_free_space - make more free space on the file-system. 140 * make_free_space - make more free space on the file-system.
151 * @c: UBIFS file-system description object 141 * @c: UBIFS file-system description object
152 * @ri: information about previous invocations of this function
153 * 142 *
154 * This function is called when an operation cannot be budgeted because there 143 * This function is called when an operation cannot be budgeted because there
155 * is supposedly no free space. But in most cases there is some free space: 144 * is supposedly no free space. But in most cases there is some free space:
156 * o budgeting is pessimistic, so it always budgets more then it is actually 145 * o budgeting is pessimistic, so it always budgets more than it is actually
157 * needed, so shrinking the liability is one way to make free space - the 146 * needed, so shrinking the liability is one way to make free space - the
158 * cached data will take less space then it was budgeted for; 147 * cached data will take less space then it was budgeted for;
159 * o GC may turn some dark space into free space (budgeting treats dark space 148 * o GC may turn some dark space into free space (budgeting treats dark space
@@ -165,87 +154,42 @@ static int run_gc(struct ubifs_info *c)
165 * Returns %-ENOSPC if it couldn't do more free space, and other negative error 154 * Returns %-ENOSPC if it couldn't do more free space, and other negative error
166 * codes on failures. 155 * codes on failures.
167 */ 156 */
168static int make_free_space(struct ubifs_info *c, struct retries_info *ri) 157static int make_free_space(struct ubifs_info *c)
169{ 158{
170 int err; 159 int err, retries = 0;
160 long long liab1, liab2;
171 161
172 /* 162 do {
173 * If we have some dirty pages and inodes (liability), try to write 163 liab1 = get_liability(c);
174 * them back unless this was tried too many times without effect 164 /*
175 * already. 165 * We probably have some dirty pages or inodes (liability), try
176 */ 166 * to write them back.
177 if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) { 167 */
178 long long liability; 168 dbg_budg("liability %lld, run write-back", liab1);
179 169 shrink_liability(c, NR_TO_WRITE);
180 spin_lock(&c->space_lock);
181 liability = c->budg_idx_growth + c->budg_data_growth +
182 c->budg_dd_growth;
183 spin_unlock(&c->space_lock);
184
185 if (ri->prev_liability >= liability) {
186 /* Liability does not shrink, next time try GC then */
187 ri->shrink_retries += 1;
188 if (ri->gc_retries < MAX_GC_RETRIES)
189 ri->try_gc = 1;
190 dbg_budg("liability did not shrink: retries %d of %d",
191 ri->shrink_retries, MAX_SHRINK_RETRIES);
192 }
193
194 dbg_budg("force write-back (count %d)", ri->shrink_cnt);
195 shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt);
196 170
197 ri->prev_liability = liability; 171 liab2 = get_liability(c);
198 ri->shrink_cnt += 1; 172 if (liab2 < liab1)
199 return -EAGAIN; 173 return -EAGAIN;
200 }
201 174
202 /* 175 dbg_budg("new liability %lld (not shrinked)", liab2);
203 * Try to run garbage collector unless it was already tried too many
204 * times.
205 */
206 if (ri->gc_retries < MAX_GC_RETRIES) {
207 ri->gc_retries += 1;
208 dbg_budg("run GC, retries %d of %d",
209 ri->gc_retries, MAX_GC_RETRIES);
210 176
211 ri->try_gc = 0; 177 /* Liability did not shrink again, try GC */
178 dbg_budg("Run GC");
212 err = run_gc(c); 179 err = run_gc(c);
213 if (!err) 180 if (!err)
214 return -EAGAIN; 181 return -EAGAIN;
215 182
216 if (err == -EAGAIN) { 183 if (err != -EAGAIN && err != -ENOSPC)
217 dbg_budg("GC asked to commit"); 184 /* Some real error happened */
218 err = ubifs_run_commit(c);
219 if (err)
220 return err;
221 return -EAGAIN;
222 }
223
224 if (err != -ENOSPC)
225 return err;
226
227 /*
228 * GC could not make any progress. If this is the first time,
229 * then it makes sense to try to commit, because it might make
230 * some dirty space.
231 */
232 dbg_budg("GC returned -ENOSPC, retries %d",
233 ri->nospc_retries);
234 if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
235 return err; 185 return err;
236 ri->nospc_retries += 1;
237 }
238 186
239 /* Neither GC nor write-back helped, try to commit */ 187 dbg_budg("Run commit (retries %d)", retries);
240 if (ri->cmt_retries < MAX_CMT_RETRIES) {
241 ri->cmt_retries += 1;
242 dbg_budg("run commit, retries %d of %d",
243 ri->cmt_retries, MAX_CMT_RETRIES);
244 err = ubifs_run_commit(c); 188 err = ubifs_run_commit(c);
245 if (err) 189 if (err)
246 return err; 190 return err;
247 return -EAGAIN; 191 } while (retries++ < MAX_MKSPC_RETRIES);
248 } 192
249 return -ENOSPC; 193 return -ENOSPC;
250} 194}
251 195
@@ -258,8 +202,8 @@ static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
258 */ 202 */
259int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
260{ 204{
261 int ret; 205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
262 uint64_t idx_size; 206 long long idx_size;
263 207
264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
265 209
@@ -271,23 +215,16 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
271 * pair, nor similarly the two variables for the new index size, so we 215 * pair, nor similarly the two variables for the new index size, so we
272 * have to do this costly 64-bit division on fast-path. 216 * have to do this costly 64-bit division on fast-path.
273 */ 217 */
274 if (do_div(idx_size, c->leb_size - c->max_idx_node_sz)) 218 idx_size += eff_leb_size - 1;
275 ret = idx_size + 1; 219 idx_lebs = div_u64(idx_size, eff_leb_size);
276 else
277 ret = idx_size;
278 /* 220 /*
279 * The index head is not available for the in-the-gaps method, so add an 221 * The index head is not available for the in-the-gaps method, so add an
280 * extra LEB to compensate. 222 * extra LEB to compensate.
281 */ 223 */
282 ret += 1; 224 idx_lebs += 1;
283 /* 225 if (idx_lebs < MIN_INDEX_LEBS)
284 * At present the index needs at least 2 LEBs: one for the index head 226 idx_lebs = MIN_INDEX_LEBS;
285 * and one for in-the-gaps method (which currently does not cater for 227 return idx_lebs;
286 * the index head and so excludes it from consideration).
287 */
288 if (ret < 2)
289 ret = 2;
290 return ret;
291} 228}
292 229
293/** 230/**
@@ -530,8 +467,7 @@ static int calc_dd_growth(const struct ubifs_info *c,
530int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) 467int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
531{ 468{
532 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); 469 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
533 int err, idx_growth, data_growth, dd_growth; 470 int err, idx_growth, data_growth, dd_growth, retried = 0;
534 struct retries_info ri;
535 471
536 ubifs_assert(req->new_page <= 1); 472 ubifs_assert(req->new_page <= 1);
537 ubifs_assert(req->dirtied_page <= 1); 473 ubifs_assert(req->dirtied_page <= 1);
@@ -549,7 +485,6 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
549 if (!data_growth && !dd_growth) 485 if (!data_growth && !dd_growth)
550 return 0; 486 return 0;
551 idx_growth = calc_idx_growth(c, req); 487 idx_growth = calc_idx_growth(c, req);
552 memset(&ri, 0, sizeof(struct retries_info));
553 488
554again: 489again:
555 spin_lock(&c->space_lock); 490 spin_lock(&c->space_lock);
@@ -587,12 +522,17 @@ again:
587 return err; 522 return err;
588 } 523 }
589 524
590 err = make_free_space(c, &ri); 525 err = make_free_space(c);
526 cond_resched();
591 if (err == -EAGAIN) { 527 if (err == -EAGAIN) {
592 dbg_budg("try again"); 528 dbg_budg("try again");
593 cond_resched();
594 goto again; 529 goto again;
595 } else if (err == -ENOSPC) { 530 } else if (err == -ENOSPC) {
531 if (!retried) {
532 retried = 1;
533 dbg_budg("-ENOSPC, but anyway try once again");
534 goto again;
535 }
596 dbg_budg("FS is full, -ENOSPC"); 536 dbg_budg("FS is full, -ENOSPC");
597 c->nospace = 1; 537 c->nospace = 1;
598 if (can_use_rp(c) || c->rp_size == 0) 538 if (can_use_rp(c) || c->rp_size == 0)
@@ -666,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
666 * @c: UBIFS file-system description object 606 * @c: UBIFS file-system description object
667 * 607 *
668 * This function converts budget which was allocated for a new page of data to 608 * This function converts budget which was allocated for a new page of data to
669 * the budget of changing an existing page of data. The latter is smaller then 609 * the budget of changing an existing page of data. The latter is smaller than
670 * the former, so this function only does simple re-calculation and does not 610 * the former, so this function only does simple re-calculation and does not
671 * involve any write-back. 611 * involve any write-back.
672 */ 612 */
@@ -712,9 +652,9 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
712 * user-space. User-space application tend to expect that if the file-system 652 * user-space. User-space application tend to expect that if the file-system
713 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they 653 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
714 * are able to write a file of size N. UBIFS attaches node headers to each data 654 * are able to write a file of size N. UBIFS attaches node headers to each data
715 * node and it has to write indexind nodes as well. This introduces additional 655 * node and it has to write indexing nodes as well. This introduces additional
716 * overhead, and UBIFS it has to report sligtly less free space to meet the 656 * overhead, and UBIFS has to report slightly less free space to meet the above
717 * above expectetion. 657 * expectations.
718 * 658 *
719 * This function assumes free space is made up of uncompressed data nodes and 659 * This function assumes free space is made up of uncompressed data nodes and
720 * full index nodes (one per data node, tripled because we always allow enough 660 * full index nodes (one per data node, tripled because we always allow enough
@@ -723,7 +663,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
723 * Note, the calculation is pessimistic, which means that most of the time 663 * Note, the calculation is pessimistic, which means that most of the time
724 * UBIFS reports less space than it actually has. 664 * UBIFS reports less space than it actually has.
725 */ 665 */
726long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) 666long long ubifs_reported_space(const struct ubifs_info *c, long long free)
727{ 667{
728 int divisor, factor, f; 668 int divisor, factor, f;
729 669
@@ -737,7 +677,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
737 * of data nodes, f - fanout. Because effective UBIFS fanout is twice 677 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
738 * as less than maximum fanout, we assume that each data node 678 * as less than maximum fanout, we assume that each data node
739 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. 679 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
740 * Note, the multiplier 3 is because UBIFS reseves thrice as more space 680 * Note, the multiplier 3 is because UBIFS reserves thrice as more space
741 * for the index. 681 * for the index.
742 */ 682 */
743 f = c->fanout > 3 ? c->fanout >> 1 : 2; 683 f = c->fanout > 3 ? c->fanout >> 1 : 2;
@@ -745,45 +685,33 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
745 divisor = UBIFS_MAX_DATA_NODE_SZ; 685 divisor = UBIFS_MAX_DATA_NODE_SZ;
746 divisor += (c->max_idx_node_sz * 3) / (f - 1); 686 divisor += (c->max_idx_node_sz * 3) / (f - 1);
747 free *= factor; 687 free *= factor;
748 do_div(free, divisor); 688 return div_u64(free, divisor);
749 return free;
750} 689}
751 690
752/** 691/**
753 * ubifs_get_free_space - return amount of free space. 692 * ubifs_get_free_space_nolock - return amount of free space.
754 * @c: UBIFS file-system description object 693 * @c: UBIFS file-system description object
755 * 694 *
756 * This function calculates amount of free space to report to user-space. 695 * This function calculates amount of free space to report to user-space.
757 * 696 *
758 * Because UBIFS may introduce substantial overhead (the index, node headers, 697 * Because UBIFS may introduce substantial overhead (the index, node headers,
759 * alighment, wastage at the end of eraseblocks, etc), it cannot report real 698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real
760 * amount of free flash space it has (well, because not all dirty space is 699 * amount of free flash space it has (well, because not all dirty space is
761 * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, 700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
762 * it would bread user expectetion about what free space is. Users seem to 701 * it would break user expectations about what free space is. Users seem to
763 * accustomed to assume that if the file-system reports N bytes of free space, 702 * accustomed to assume that if the file-system reports N bytes of free space,
764 * they would be able to fit a file of N bytes to the FS. This almost works for 703 * they would be able to fit a file of N bytes to the FS. This almost works for
765 * traditional file-systems, because they have way less overhead than UBIFS. 704 * traditional file-systems, because they have way less overhead than UBIFS.
766 * So, to keep users happy, UBIFS tries to take the overhead into account. 705 * So, to keep users happy, UBIFS tries to take the overhead into account.
767 */ 706 */
768long long ubifs_get_free_space(struct ubifs_info *c) 707long long ubifs_get_free_space_nolock(struct ubifs_info *c)
769{ 708{
770 int min_idx_lebs, rsvd_idx_lebs, lebs; 709 int rsvd_idx_lebs, lebs;
771 long long available, outstanding, free; 710 long long available, outstanding, free;
772 711
773 spin_lock(&c->space_lock); 712 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
774 min_idx_lebs = ubifs_calc_min_idx_lebs(c);
775 outstanding = c->budg_data_growth + c->budg_dd_growth; 713 outstanding = c->budg_data_growth + c->budg_dd_growth;
776 714 available = ubifs_calc_available(c, c->min_idx_lebs);
777 /*
778 * Force the amount available to the total size reported if the used
779 * space is zero.
780 */
781 if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
782 spin_unlock(&c->space_lock);
783 return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
784 }
785
786 available = ubifs_calc_available(c, min_idx_lebs);
787 715
788 /* 716 /*
789 * When reporting free space to user-space, UBIFS guarantees that it is 717 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -796,15 +724,14 @@ long long ubifs_get_free_space(struct ubifs_info *c)
796 * Note, the calculations below are similar to what we have in 724 * Note, the calculations below are similar to what we have in
797 * 'do_budget_space()', so refer there for comments. 725 * 'do_budget_space()', so refer there for comments.
798 */ 726 */
799 if (min_idx_lebs > c->lst.idx_lebs) 727 if (c->min_idx_lebs > c->lst.idx_lebs)
800 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; 728 rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
801 else 729 else
802 rsvd_idx_lebs = 0; 730 rsvd_idx_lebs = 0;
803 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 731 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
804 c->lst.taken_empty_lebs; 732 c->lst.taken_empty_lebs;
805 lebs -= rsvd_idx_lebs; 733 lebs -= rsvd_idx_lebs;
806 available += lebs * (c->dark_wm - c->leb_overhead); 734 available += lebs * (c->dark_wm - c->leb_overhead);
807 spin_unlock(&c->space_lock);
808 735
809 if (available > outstanding) 736 if (available > outstanding)
810 free = ubifs_reported_space(c, available - outstanding); 737 free = ubifs_reported_space(c, available - outstanding);
@@ -812,3 +739,21 @@ long long ubifs_get_free_space(struct ubifs_info *c)
812 free = 0; 739 free = 0;
813 return free; 740 return free;
814} 741}
742
743/**
744 * ubifs_get_free_space - return amount of free space.
745 * @c: UBIFS file-system description object
746 *
747 * This function calculates and returns amount of free space to report to
748 * user-space.
749 */
750long long ubifs_get_free_space(struct ubifs_info *c)
751{
752 long long free;
753
754 spin_lock(&c->space_lock);
755 free = ubifs_get_free_space_nolock(c);
756 spin_unlock(&c->space_lock);
757
758 return free;
759}
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index b49884c8c10e..f3a7945527fb 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -470,12 +470,12 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
470{ 470{
471 struct ubifs_idx_node *idx; 471 struct ubifs_idx_node *idx;
472 int lnum, offs, len, err = 0; 472 int lnum, offs, len, err = 0;
473 struct ubifs_debug_info *d = c->dbg;
473 474
474 c->old_zroot = *zroot; 475 d->old_zroot = *zroot;
475 476 lnum = d->old_zroot.lnum;
476 lnum = c->old_zroot.lnum; 477 offs = d->old_zroot.offs;
477 offs = c->old_zroot.offs; 478 len = d->old_zroot.len;
478 len = c->old_zroot.len;
479 479
480 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); 480 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
481 if (!idx) 481 if (!idx)
@@ -485,8 +485,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
485 if (err) 485 if (err)
486 goto out; 486 goto out;
487 487
488 c->old_zroot_level = le16_to_cpu(idx->level); 488 d->old_zroot_level = le16_to_cpu(idx->level);
489 c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); 489 d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
490out: 490out:
491 kfree(idx); 491 kfree(idx);
492 return err; 492 return err;
@@ -509,6 +509,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
509{ 509{
510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; 510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
511 int first = 1, iip; 511 int first = 1, iip;
512 struct ubifs_debug_info *d = c->dbg;
512 union ubifs_key lower_key, upper_key, l_key, u_key; 513 union ubifs_key lower_key, upper_key, l_key, u_key;
513 unsigned long long uninitialized_var(last_sqnum); 514 unsigned long long uninitialized_var(last_sqnum);
514 struct ubifs_idx_node *idx; 515 struct ubifs_idx_node *idx;
@@ -525,9 +526,9 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
525 UBIFS_IDX_NODE_SZ; 526 UBIFS_IDX_NODE_SZ;
526 527
527 /* Start at the old zroot */ 528 /* Start at the old zroot */
528 lnum = c->old_zroot.lnum; 529 lnum = d->old_zroot.lnum;
529 offs = c->old_zroot.offs; 530 offs = d->old_zroot.offs;
530 len = c->old_zroot.len; 531 len = d->old_zroot.len;
531 iip = 0; 532 iip = 0;
532 533
533 /* 534 /*
@@ -560,11 +561,11 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
560 if (first) { 561 if (first) {
561 first = 0; 562 first = 0;
562 /* Check root level and sqnum */ 563 /* Check root level and sqnum */
563 if (le16_to_cpu(idx->level) != c->old_zroot_level) { 564 if (le16_to_cpu(idx->level) != d->old_zroot_level) {
564 err = 2; 565 err = 2;
565 goto out_dump; 566 goto out_dump;
566 } 567 }
567 if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) { 568 if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) {
568 err = 3; 569 err = 3;
569 goto out_dump; 570 goto out_dump;
570 } 571 }
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index a0ada596b17c..11e4132f314a 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -33,7 +33,7 @@
33/* Fake description object for the "none" compressor */ 33/* Fake description object for the "none" compressor */
34static struct ubifs_compressor none_compr = { 34static struct ubifs_compressor none_compr = {
35 .compr_type = UBIFS_COMPR_NONE, 35 .compr_type = UBIFS_COMPR_NONE,
36 .name = "no compression", 36 .name = "none",
37 .capi_name = "", 37 .capi_name = "",
38}; 38};
39 39
@@ -43,13 +43,13 @@ static DEFINE_MUTEX(lzo_mutex);
43static struct ubifs_compressor lzo_compr = { 43static struct ubifs_compressor lzo_compr = {
44 .compr_type = UBIFS_COMPR_LZO, 44 .compr_type = UBIFS_COMPR_LZO,
45 .comp_mutex = &lzo_mutex, 45 .comp_mutex = &lzo_mutex,
46 .name = "LZO", 46 .name = "lzo",
47 .capi_name = "lzo", 47 .capi_name = "lzo",
48}; 48};
49#else 49#else
50static struct ubifs_compressor lzo_compr = { 50static struct ubifs_compressor lzo_compr = {
51 .compr_type = UBIFS_COMPR_LZO, 51 .compr_type = UBIFS_COMPR_LZO,
52 .name = "LZO", 52 .name = "lzo",
53}; 53};
54#endif 54#endif
55 55
@@ -108,7 +108,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
108 if (compr->comp_mutex) 108 if (compr->comp_mutex)
109 mutex_lock(compr->comp_mutex); 109 mutex_lock(compr->comp_mutex);
110 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, 110 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
111 out_len); 111 (unsigned int *)out_len);
112 if (compr->comp_mutex) 112 if (compr->comp_mutex)
113 mutex_unlock(compr->comp_mutex); 113 mutex_unlock(compr->comp_mutex);
114 if (unlikely(err)) { 114 if (unlikely(err)) {
@@ -119,10 +119,10 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
119 } 119 }
120 120
121 /* 121 /*
122 * Presently, we just require that compression results in less data, 122 * If the data compressed only slightly, it is better to leave it
123 * rather than any defined minimum compression ratio or amount. 123 * uncompressed to improve read speed.
124 */ 124 */
125 if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8)) 125 if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF)
126 goto no_compr; 126 goto no_compr;
127 127
128 return; 128 return;
@@ -172,7 +172,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
172 if (compr->decomp_mutex) 172 if (compr->decomp_mutex)
173 mutex_lock(compr->decomp_mutex); 173 mutex_lock(compr->decomp_mutex);
174 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, 174 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
175 out_len); 175 (unsigned int *)out_len);
176 if (compr->decomp_mutex) 176 if (compr->decomp_mutex)
177 mutex_unlock(compr->decomp_mutex); 177 mutex_unlock(compr->decomp_mutex);
178 if (err) 178 if (err)
@@ -244,7 +244,7 @@ out_lzo:
244/** 244/**
245 * ubifs_compressors_exit - de-initialize UBIFS compressors. 245 * ubifs_compressors_exit - de-initialize UBIFS compressors.
246 */ 246 */
247void __exit ubifs_compressors_exit(void) 247void ubifs_compressors_exit(void)
248{ 248{
249 compr_exit(&lzo_compr); 249 compr_exit(&lzo_compr);
250 compr_exit(&zlib_compr); 250 compr_exit(&zlib_compr);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 510ffa0bbda4..e975bd82f38b 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -32,6 +32,8 @@
32#include "ubifs.h" 32#include "ubifs.h"
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h>
36#include <linux/math64.h>
35 37
36#ifdef CONFIG_UBIFS_FS_DEBUG 38#ifdef CONFIG_UBIFS_FS_DEBUG
37 39
@@ -596,7 +598,9 @@ void dbg_dump_budg(struct ubifs_info *c)
596 struct rb_node *rb; 598 struct rb_node *rb;
597 struct ubifs_bud *bud; 599 struct ubifs_bud *bud;
598 struct ubifs_gced_idx_leb *idx_gc; 600 struct ubifs_gced_idx_leb *idx_gc;
601 long long available, outstanding, free;
599 602
603 ubifs_assert(spin_is_locked(&c->space_lock));
600 spin_lock(&dbg_lock); 604 spin_lock(&dbg_lock);
601 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " 605 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
602 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, 606 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
@@ -616,9 +620,11 @@ void dbg_dump_budg(struct ubifs_info *c)
616 c->dark_wm, c->dead_wm, c->max_idx_node_sz); 620 c->dark_wm, c->dead_wm, c->max_idx_node_sz);
617 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", 621 printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
618 c->gc_lnum, c->ihead_lnum); 622 c->gc_lnum, c->ihead_lnum);
619 for (i = 0; i < c->jhead_cnt; i++) 623 /* If we are in R/O mode, journal heads do not exist */
620 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", 624 if (c->jheads)
621 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); 625 for (i = 0; i < c->jhead_cnt; i++)
626 printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
627 c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
622 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { 628 for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
623 bud = rb_entry(rb, struct ubifs_bud, rb); 629 bud = rb_entry(rb, struct ubifs_bud, rb);
624 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); 630 printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -629,6 +635,14 @@ void dbg_dump_budg(struct ubifs_info *c)
629 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", 635 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
630 idx_gc->lnum, idx_gc->unmap); 636 idx_gc->lnum, idx_gc->unmap);
631 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); 637 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
638
639 /* Print budgeting predictions */
640 available = ubifs_calc_available(c, c->min_idx_lebs);
641 outstanding = c->budg_data_growth + c->budg_dd_growth;
642 free = ubifs_get_free_space_nolock(c);
643 printk(KERN_DEBUG "Budgeting predictions:\n");
644 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
645 available, outstanding, free);
632 spin_unlock(&dbg_lock); 646 spin_unlock(&dbg_lock);
633} 647}
634 648
@@ -645,7 +659,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
645 struct ubifs_lprops lp; 659 struct ubifs_lprops lp;
646 struct ubifs_lp_stats lst; 660 struct ubifs_lp_stats lst;
647 661
648 printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid); 662 printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n",
663 current->pid);
649 ubifs_get_lp_stats(c, &lst); 664 ubifs_get_lp_stats(c, &lst);
650 dbg_dump_lstats(&lst); 665 dbg_dump_lstats(&lst);
651 666
@@ -656,6 +671,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
656 671
657 dbg_dump_lprop(c, &lp); 672 dbg_dump_lprop(c, &lp);
658 } 673 }
674 printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n",
675 current->pid);
659} 676}
660 677
661void dbg_dump_lpt_info(struct ubifs_info *c) 678void dbg_dump_lpt_info(struct ubifs_info *c)
@@ -663,6 +680,7 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
663 int i; 680 int i;
664 681
665 spin_lock(&dbg_lock); 682 spin_lock(&dbg_lock);
683 printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid);
666 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); 684 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz);
667 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); 685 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz);
668 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); 686 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz);
@@ -684,7 +702,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
684 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); 702 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
685 printk(KERN_DEBUG "\tLPT head is at %d:%d\n", 703 printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
686 c->nhead_lnum, c->nhead_offs); 704 c->nhead_lnum, c->nhead_offs);
687 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); 705 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n",
706 c->ltab_lnum, c->ltab_offs);
688 if (c->big_lpt) 707 if (c->big_lpt)
689 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", 708 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
690 c->lsave_lnum, c->lsave_offs); 709 c->lsave_lnum, c->lsave_offs);
@@ -703,9 +722,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
703 if (dbg_failure_mode) 722 if (dbg_failure_mode)
704 return; 723 return;
705 724
706 printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum); 725 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
707 726 current->pid, lnum);
708 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 727 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
709 if (IS_ERR(sleb)) { 728 if (IS_ERR(sleb)) {
710 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 729 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
711 return; 730 return;
@@ -721,6 +740,8 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
721 dbg_dump_node(c, snod->node); 740 dbg_dump_node(c, snod->node);
722 } 741 }
723 742
743 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
744 current->pid, lnum);
724 ubifs_scan_destroy(sleb); 745 ubifs_scan_destroy(sleb);
725 return; 746 return;
726} 747}
@@ -768,7 +789,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
768{ 789{
769 int i; 790 int i;
770 791
771 printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", 792 printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n",
772 current->pid, cat, heap->cnt); 793 current->pid, cat, heap->cnt);
773 for (i = 0; i < heap->cnt; i++) { 794 for (i = 0; i < heap->cnt; i++) {
774 struct ubifs_lprops *lprops = heap->arr[i]; 795 struct ubifs_lprops *lprops = heap->arr[i];
@@ -777,6 +798,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
777 "flags %d\n", i, lprops->lnum, lprops->hpos, 798 "flags %d\n", i, lprops->lnum, lprops->hpos,
778 lprops->free, lprops->dirty, lprops->flags); 799 lprops->free, lprops->dirty, lprops->flags);
779 } 800 }
801 printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid);
780} 802}
781 803
782void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 804void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -784,7 +806,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
784{ 806{
785 int i; 807 int i;
786 808
787 printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid); 809 printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid);
788 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", 810 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
789 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); 811 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
790 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", 812 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -803,7 +825,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
803 int level; 825 int level;
804 826
805 printk(KERN_DEBUG "\n"); 827 printk(KERN_DEBUG "\n");
806 printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid); 828 printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid);
807 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); 829 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
808 level = znode->level; 830 level = znode->level;
809 printk(KERN_DEBUG "== Level %d ==\n", level); 831 printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -815,8 +837,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
815 dbg_dump_znode(c, znode); 837 dbg_dump_znode(c, znode);
816 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); 838 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
817 } 839 }
818 840 printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid);
819 printk(KERN_DEBUG "\n");
820} 841}
821 842
822static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, 843static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
@@ -839,6 +860,65 @@ void dbg_dump_index(struct ubifs_info *c)
839} 860}
840 861
841/** 862/**
863 * dbg_save_space_info - save information about flash space.
864 * @c: UBIFS file-system description object
865 *
866 * This function saves information about UBIFS free space, dirty space, etc, in
867 * order to check it later.
868 */
869void dbg_save_space_info(struct ubifs_info *c)
870{
871 struct ubifs_debug_info *d = c->dbg;
872
873 ubifs_get_lp_stats(c, &d->saved_lst);
874
875 spin_lock(&c->space_lock);
876 d->saved_free = ubifs_get_free_space_nolock(c);
877 spin_unlock(&c->space_lock);
878}
879
880/**
881 * dbg_check_space_info - check flash space information.
882 * @c: UBIFS file-system description object
883 *
884 * This function compares current flash space information with the information
885 * which was saved when the 'dbg_save_space_info()' function was called.
886 * Returns zero if the information has not changed, and %-EINVAL if it has
887 * changed.
888 */
889int dbg_check_space_info(struct ubifs_info *c)
890{
891 struct ubifs_debug_info *d = c->dbg;
892 struct ubifs_lp_stats lst;
893 long long avail, free;
894
895 spin_lock(&c->space_lock);
896 avail = ubifs_calc_available(c, c->min_idx_lebs);
897 spin_unlock(&c->space_lock);
898 free = ubifs_get_free_space(c);
899
900 if (free != d->saved_free) {
901 ubifs_err("free space changed from %lld to %lld",
902 d->saved_free, free);
903 goto out;
904 }
905
906 return 0;
907
908out:
909 ubifs_msg("saved lprops statistics dump");
910 dbg_dump_lstats(&d->saved_lst);
911 ubifs_get_lp_stats(c, &lst);
912 ubifs_msg("current lprops statistics dump");
913 dbg_dump_lstats(&d->saved_lst);
914 spin_lock(&c->space_lock);
915 dbg_dump_budg(c);
916 spin_unlock(&c->space_lock);
917 dump_stack();
918 return -EINVAL;
919}
920
921/**
842 * dbg_check_synced_i_size - check synchronized inode size. 922 * dbg_check_synced_i_size - check synchronized inode size.
843 * @inode: inode to check 923 * @inode: inode to check
844 * 924 *
@@ -992,8 +1072,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
992 zbr1->offs, DBGKEY(&key)); 1072 zbr1->offs, DBGKEY(&key));
993 dbg_err("but it should have key %s according to tnc", 1073 dbg_err("but it should have key %s according to tnc",
994 DBGKEY(&zbr1->key)); 1074 DBGKEY(&zbr1->key));
995 dbg_dump_node(c, dent1); 1075 dbg_dump_node(c, dent1);
996 goto out_free; 1076 goto out_free;
997 } 1077 }
998 1078
999 key_read(c, &dent2->key, &key); 1079 key_read(c, &dent2->key, &key);
@@ -1002,8 +1082,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1002 zbr1->offs, DBGKEY(&key)); 1082 zbr1->offs, DBGKEY(&key));
1003 dbg_err("but it should have key %s according to tnc", 1083 dbg_err("but it should have key %s according to tnc",
1004 DBGKEY(&zbr2->key)); 1084 DBGKEY(&zbr2->key));
1005 dbg_dump_node(c, dent2); 1085 dbg_dump_node(c, dent2);
1006 goto out_free; 1086 goto out_free;
1007 } 1087 }
1008 1088
1009 nlen1 = le16_to_cpu(dent1->nlen); 1089 nlen1 = le16_to_cpu(dent1->nlen);
@@ -1020,9 +1100,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1020 dbg_err("bad order of colliding key %s", 1100 dbg_err("bad order of colliding key %s",
1021 DBGKEY(&key)); 1101 DBGKEY(&key));
1022 1102
1023 dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); 1103 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
1024 dbg_dump_node(c, dent1); 1104 dbg_dump_node(c, dent1);
1025 dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); 1105 ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
1026 dbg_dump_node(c, dent2); 1106 dbg_dump_node(c, dent2);
1027 1107
1028out_free: 1108out_free:
@@ -1327,7 +1407,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
1327 * @c: UBIFS file-system description object 1407 * @c: UBIFS file-system description object
1328 * @leaf_cb: called for each leaf node 1408 * @leaf_cb: called for each leaf node
1329 * @znode_cb: called for each indexing node 1409 * @znode_cb: called for each indexing node
1330 * @priv: private date which is passed to callbacks 1410 * @priv: private data which is passed to callbacks
1331 * 1411 *
1332 * This function walks the UBIFS index and calls the @leaf_cb for each leaf 1412 * This function walks the UBIFS index and calls the @leaf_cb for each leaf
1333 * node and @znode_cb for each indexing node. Returns zero in case of success 1413 * node and @znode_cb for each indexing node. Returns zero in case of success
@@ -2097,13 +2177,13 @@ static int simple_rand(void)
2097 return (next >> 16) & 32767; 2177 return (next >> 16) & 32767;
2098} 2178}
2099 2179
2100void dbg_failure_mode_registration(struct ubifs_info *c) 2180static void failure_mode_init(struct ubifs_info *c)
2101{ 2181{
2102 struct failure_mode_info *fmi; 2182 struct failure_mode_info *fmi;
2103 2183
2104 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); 2184 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
2105 if (!fmi) { 2185 if (!fmi) {
2106 dbg_err("Failed to register failure mode - no memory"); 2186 ubifs_err("Failed to register failure mode - no memory");
2107 return; 2187 return;
2108 } 2188 }
2109 fmi->c = c; 2189 fmi->c = c;
@@ -2112,7 +2192,7 @@ void dbg_failure_mode_registration(struct ubifs_info *c)
2112 spin_unlock(&fmi_lock); 2192 spin_unlock(&fmi_lock);
2113} 2193}
2114 2194
2115void dbg_failure_mode_deregistration(struct ubifs_info *c) 2195static void failure_mode_exit(struct ubifs_info *c)
2116{ 2196{
2117 struct failure_mode_info *fmi, *tmp; 2197 struct failure_mode_info *fmi, *tmp;
2118 2198
@@ -2146,42 +2226,44 @@ static int in_failure_mode(struct ubi_volume_desc *desc)
2146 struct ubifs_info *c = dbg_find_info(desc); 2226 struct ubifs_info *c = dbg_find_info(desc);
2147 2227
2148 if (c && dbg_failure_mode) 2228 if (c && dbg_failure_mode)
2149 return c->failure_mode; 2229 return c->dbg->failure_mode;
2150 return 0; 2230 return 0;
2151} 2231}
2152 2232
2153static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) 2233static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2154{ 2234{
2155 struct ubifs_info *c = dbg_find_info(desc); 2235 struct ubifs_info *c = dbg_find_info(desc);
2236 struct ubifs_debug_info *d;
2156 2237
2157 if (!c || !dbg_failure_mode) 2238 if (!c || !dbg_failure_mode)
2158 return 0; 2239 return 0;
2159 if (c->failure_mode) 2240 d = c->dbg;
2241 if (d->failure_mode)
2160 return 1; 2242 return 1;
2161 if (!c->fail_cnt) { 2243 if (!d->fail_cnt) {
2162 /* First call - decide delay to failure */ 2244 /* First call - decide delay to failure */
2163 if (chance(1, 2)) { 2245 if (chance(1, 2)) {
2164 unsigned int delay = 1 << (simple_rand() >> 11); 2246 unsigned int delay = 1 << (simple_rand() >> 11);
2165 2247
2166 if (chance(1, 2)) { 2248 if (chance(1, 2)) {
2167 c->fail_delay = 1; 2249 d->fail_delay = 1;
2168 c->fail_timeout = jiffies + 2250 d->fail_timeout = jiffies +
2169 msecs_to_jiffies(delay); 2251 msecs_to_jiffies(delay);
2170 dbg_rcvry("failing after %ums", delay); 2252 dbg_rcvry("failing after %ums", delay);
2171 } else { 2253 } else {
2172 c->fail_delay = 2; 2254 d->fail_delay = 2;
2173 c->fail_cnt_max = delay; 2255 d->fail_cnt_max = delay;
2174 dbg_rcvry("failing after %u calls", delay); 2256 dbg_rcvry("failing after %u calls", delay);
2175 } 2257 }
2176 } 2258 }
2177 c->fail_cnt += 1; 2259 d->fail_cnt += 1;
2178 } 2260 }
2179 /* Determine if failure delay has expired */ 2261 /* Determine if failure delay has expired */
2180 if (c->fail_delay == 1) { 2262 if (d->fail_delay == 1) {
2181 if (time_before(jiffies, c->fail_timeout)) 2263 if (time_before(jiffies, d->fail_timeout))
2182 return 0; 2264 return 0;
2183 } else if (c->fail_delay == 2) 2265 } else if (d->fail_delay == 2)
2184 if (c->fail_cnt++ < c->fail_cnt_max) 2266 if (d->fail_cnt++ < d->fail_cnt_max)
2185 return 0; 2267 return 0;
2186 if (lnum == UBIFS_SB_LNUM) { 2268 if (lnum == UBIFS_SB_LNUM) {
2187 if (write) { 2269 if (write) {
@@ -2239,7 +2321,7 @@ static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2239 dbg_rcvry("failing in bud LEB %d commit not running", lnum); 2321 dbg_rcvry("failing in bud LEB %d commit not running", lnum);
2240 } 2322 }
2241 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); 2323 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
2242 c->failure_mode = 1; 2324 d->failure_mode = 1;
2243 dump_stack(); 2325 dump_stack();
2244 return 1; 2326 return 1;
2245} 2327}
@@ -2344,4 +2426,177 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
2344 return 0; 2426 return 0;
2345} 2427}
2346 2428
2429/**
2430 * ubifs_debugging_init - initialize UBIFS debugging.
2431 * @c: UBIFS file-system description object
2432 *
2433 * This function initializes debugging-related data for the file system.
2434 * Returns zero in case of success and a negative error code in case of
2435 * failure.
2436 */
2437int ubifs_debugging_init(struct ubifs_info *c)
2438{
2439 c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
2440 if (!c->dbg)
2441 return -ENOMEM;
2442
2443 c->dbg->buf = vmalloc(c->leb_size);
2444 if (!c->dbg->buf)
2445 goto out;
2446
2447 failure_mode_init(c);
2448 return 0;
2449
2450out:
2451 kfree(c->dbg);
2452 return -ENOMEM;
2453}
2454
2455/**
2456 * ubifs_debugging_exit - free debugging data.
2457 * @c: UBIFS file-system description object
2458 */
2459void ubifs_debugging_exit(struct ubifs_info *c)
2460{
2461 failure_mode_exit(c);
2462 vfree(c->dbg->buf);
2463 kfree(c->dbg);
2464}
2465
2466/*
2467 * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
2468 * contain the stuff specific to particular file-system mounts.
2469 */
2470static struct dentry *dfs_rootdir;
2471
2472/**
2473 * dbg_debugfs_init - initialize debugfs file-system.
2474 *
2475 * UBIFS uses debugfs file-system to expose various debugging knobs to
2476 * user-space. This function creates "ubifs" directory in the debugfs
2477 * file-system. Returns zero in case of success and a negative error code in
2478 * case of failure.
2479 */
2480int dbg_debugfs_init(void)
2481{
2482 dfs_rootdir = debugfs_create_dir("ubifs", NULL);
2483 if (IS_ERR(dfs_rootdir)) {
2484 int err = PTR_ERR(dfs_rootdir);
2485 ubifs_err("cannot create \"ubifs\" debugfs directory, "
2486 "error %d\n", err);
2487 return err;
2488 }
2489
2490 return 0;
2491}
2492
2493/**
2494 * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
2495 */
2496void dbg_debugfs_exit(void)
2497{
2498 debugfs_remove(dfs_rootdir);
2499}
2500
2501static int open_debugfs_file(struct inode *inode, struct file *file)
2502{
2503 file->private_data = inode->i_private;
2504 return 0;
2505}
2506
2507static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2508 size_t count, loff_t *ppos)
2509{
2510 struct ubifs_info *c = file->private_data;
2511 struct ubifs_debug_info *d = c->dbg;
2512
2513 if (file->f_path.dentry == d->dfs_dump_lprops)
2514 dbg_dump_lprops(c);
2515 else if (file->f_path.dentry == d->dfs_dump_budg) {
2516 spin_lock(&c->space_lock);
2517 dbg_dump_budg(c);
2518 spin_unlock(&c->space_lock);
2519 } else if (file->f_path.dentry == d->dfs_dump_tnc) {
2520 mutex_lock(&c->tnc_mutex);
2521 dbg_dump_tnc(c);
2522 mutex_unlock(&c->tnc_mutex);
2523 } else
2524 return -EINVAL;
2525
2526 *ppos += count;
2527 return count;
2528}
2529
2530static const struct file_operations dfs_fops = {
2531 .open = open_debugfs_file,
2532 .write = write_debugfs_file,
2533 .owner = THIS_MODULE,
2534};
2535
2536/**
2537 * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance.
2538 * @c: UBIFS file-system description object
2539 *
2540 * This function creates all debugfs files for this instance of UBIFS. Returns
2541 * zero in case of success and a negative error code in case of failure.
2542 *
2543 * Note, the only reason we have not merged this function with the
2544 * 'ubifs_debugging_init()' function is because it is better to initialize
2545 * debugfs interfaces at the very end of the mount process, and remove them at
2546 * the very beginning of the un-mount process.
2547 */
2548int dbg_debugfs_init_fs(struct ubifs_info *c)
2549{
2550 int err;
2551 const char *fname;
2552 struct dentry *dent;
2553 struct ubifs_debug_info *d = c->dbg;
2554
2555 sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
2556 d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
2557 if (IS_ERR(d->dfs_dir)) {
2558 err = PTR_ERR(d->dfs_dir);
2559 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2560 d->dfs_dir_name, err);
2561 goto out;
2562 }
2563
2564 fname = "dump_lprops";
2565 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2566 if (IS_ERR(dent))
2567 goto out_remove;
2568 d->dfs_dump_lprops = dent;
2569
2570 fname = "dump_budg";
2571 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2572 if (IS_ERR(dent))
2573 goto out_remove;
2574 d->dfs_dump_budg = dent;
2575
2576 fname = "dump_tnc";
2577 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
2578 if (IS_ERR(dent))
2579 goto out_remove;
2580 d->dfs_dump_tnc = dent;
2581
2582 return 0;
2583
2584out_remove:
2585 err = PTR_ERR(dent);
2586 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2587 fname, err);
2588 debugfs_remove_recursive(d->dfs_dir);
2589out:
2590 return err;
2591}
2592
2593/**
2594 * dbg_debugfs_exit_fs - remove all debugfs files.
2595 * @c: UBIFS file-system description object
2596 */
2597void dbg_debugfs_exit_fs(struct ubifs_info *c)
2598{
2599 debugfs_remove_recursive(c->dbg->dfs_dir);
2600}
2601
2347#endif /* CONFIG_UBIFS_FS_DEBUG */ 2602#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 33d6b95071e4..c1cd73b2e06e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -25,7 +25,61 @@
25 25
26#ifdef CONFIG_UBIFS_FS_DEBUG 26#ifdef CONFIG_UBIFS_FS_DEBUG
27 27
28#define UBIFS_DBG(op) op 28/**
29 * struct ubifs_debug_info - per-FS debugging information.
30 * @buf: a buffer of LEB size, used for various purposes
31 * @old_zroot: old index root - used by 'dbg_check_old_index()'
32 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
33 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
34 * @failure_mode: failure mode for recovery testing
35 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
36 * @fail_timeout: time in jiffies when delay of failure mode expires
37 * @fail_cnt: current number of calls to failure mode I/O functions
38 * @fail_cnt_max: number of calls by which to delay failure mode
39 * @chk_lpt_sz: used by LPT tree size checker
40 * @chk_lpt_sz2: used by LPT tree size checker
41 * @chk_lpt_wastage: used by LPT tree size checker
42 * @chk_lpt_lebs: used by LPT tree size checker
43 * @new_nhead_offs: used by LPT tree size checker
44 * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
45 * @new_ihead_offs: used by debugging to check @c->ihead_offs
46 *
47 * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
48 * @saved_free: saved free space (used by 'dbg_save_space_info()')
49 *
50 * @dfs_dir_name: name of debugfs directory containing this file-system's files
51 * @dfs_dir: dentry object of the file-system debugfs directory
52 * @dfs_dump_lprops: "dump lprops" debugfs knob
53 * @dfs_dump_budg: "dump budgeting information" debugfs knob
54 * @dfs_dump_tnc: "dump TNC" debugfs knob
55 */
56struct ubifs_debug_info {
57 void *buf;
58 struct ubifs_zbranch old_zroot;
59 int old_zroot_level;
60 unsigned long long old_zroot_sqnum;
61 int failure_mode;
62 int fail_delay;
63 unsigned long fail_timeout;
64 unsigned int fail_cnt;
65 unsigned int fail_cnt_max;
66 long long chk_lpt_sz;
67 long long chk_lpt_sz2;
68 long long chk_lpt_wastage;
69 int chk_lpt_lebs;
70 int new_nhead_offs;
71 int new_ihead_lnum;
72 int new_ihead_offs;
73
74 struct ubifs_lp_stats saved_lst;
75 long long saved_free;
76
77 char dfs_dir_name[100];
78 struct dentry *dfs_dir;
79 struct dentry *dfs_dump_lprops;
80 struct dentry *dfs_dump_budg;
81 struct dentry *dfs_dump_tnc;
82};
29 83
30#define ubifs_assert(expr) do { \ 84#define ubifs_assert(expr) do { \
31 if (unlikely(!(expr))) { \ 85 if (unlikely(!(expr))) { \
@@ -211,14 +265,18 @@ extern unsigned int ubifs_msg_flags;
211extern unsigned int ubifs_chk_flags; 265extern unsigned int ubifs_chk_flags;
212extern unsigned int ubifs_tst_flags; 266extern unsigned int ubifs_tst_flags;
213 267
214/* Dump functions */ 268int ubifs_debugging_init(struct ubifs_info *c);
269void ubifs_debugging_exit(struct ubifs_info *c);
215 270
271/* Dump functions */
216const char *dbg_ntype(int type); 272const char *dbg_ntype(int type);
217const char *dbg_cstate(int cmt_state); 273const char *dbg_cstate(int cmt_state);
218const char *dbg_get_key_dump(const struct ubifs_info *c, 274const char *dbg_get_key_dump(const struct ubifs_info *c,
219 const union ubifs_key *key); 275 const union ubifs_key *key);
220void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); 276void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
221void dbg_dump_node(const struct ubifs_info *c, const void *node); 277void dbg_dump_node(const struct ubifs_info *c, const void *node);
278void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
279 int offs);
222void dbg_dump_budget_req(const struct ubifs_budget_req *req); 280void dbg_dump_budget_req(const struct ubifs_budget_req *req);
223void dbg_dump_lstats(const struct ubifs_lp_stats *lst); 281void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
224void dbg_dump_budg(struct ubifs_info *c); 282void dbg_dump_budg(struct ubifs_info *c);
@@ -233,9 +291,9 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
233 struct ubifs_nnode *parent, int iip); 291 struct ubifs_nnode *parent, int iip);
234void dbg_dump_tnc(struct ubifs_info *c); 292void dbg_dump_tnc(struct ubifs_info *c);
235void dbg_dump_index(struct ubifs_info *c); 293void dbg_dump_index(struct ubifs_info *c);
294void dbg_dump_lpt_lebs(const struct ubifs_info *c);
236 295
237/* Checking helper functions */ 296/* Checking helper functions */
238
239typedef int (*dbg_leaf_callback)(struct ubifs_info *c, 297typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
240 struct ubifs_zbranch *zbr, void *priv); 298 struct ubifs_zbranch *zbr, void *priv);
241typedef int (*dbg_znode_callback)(struct ubifs_info *c, 299typedef int (*dbg_znode_callback)(struct ubifs_info *c,
@@ -244,7 +302,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
244 dbg_znode_callback znode_cb, void *priv); 302 dbg_znode_callback znode_cb, void *priv);
245 303
246/* Checking functions */ 304/* Checking functions */
247 305void dbg_save_space_info(struct ubifs_info *c);
306int dbg_check_space_info(struct ubifs_info *c);
248int dbg_check_lprops(struct ubifs_info *c); 307int dbg_check_lprops(struct ubifs_info *c);
249int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); 308int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
250int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); 309int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
@@ -274,9 +333,6 @@ int dbg_force_in_the_gaps(void);
274 333
275#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) 334#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
276 335
277void dbg_failure_mode_registration(struct ubifs_info *c);
278void dbg_failure_mode_deregistration(struct ubifs_info *c);
279
280#ifndef UBIFS_DBG_PRESERVE_UBI 336#ifndef UBIFS_DBG_PRESERVE_UBI
281 337
282#define ubi_leb_read dbg_leb_read 338#define ubi_leb_read dbg_leb_read
@@ -318,9 +374,13 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
318 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); 374 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
319} 375}
320 376
321#else /* !CONFIG_UBIFS_FS_DEBUG */ 377/* Debugfs-related stuff */
378int dbg_debugfs_init(void);
379void dbg_debugfs_exit(void);
380int dbg_debugfs_init_fs(struct ubifs_info *c);
381void dbg_debugfs_exit_fs(struct ubifs_info *c);
322 382
323#define UBIFS_DBG(op) 383#else /* !CONFIG_UBIFS_FS_DEBUG */
324 384
325/* Use "if (0)" to make compiler check arguments even if debugging is off */ 385/* Use "if (0)" to make compiler check arguments even if debugging is off */
326#define ubifs_assert(expr) do { \ 386#define ubifs_assert(expr) do { \
@@ -360,26 +420,33 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
360#define DBGKEY(key) ((char *)(key)) 420#define DBGKEY(key) ((char *)(key))
361#define DBGKEY1(key) ((char *)(key)) 421#define DBGKEY1(key) ((char *)(key))
362 422
363#define dbg_ntype(type) "" 423#define ubifs_debugging_init(c) 0
364#define dbg_cstate(cmt_state) "" 424#define ubifs_debugging_exit(c) ({})
365#define dbg_get_key_dump(c, key) ({}) 425
366#define dbg_dump_inode(c, inode) ({}) 426#define dbg_ntype(type) ""
367#define dbg_dump_node(c, node) ({}) 427#define dbg_cstate(cmt_state) ""
368#define dbg_dump_budget_req(req) ({}) 428#define dbg_get_key_dump(c, key) ({})
369#define dbg_dump_lstats(lst) ({}) 429#define dbg_dump_inode(c, inode) ({})
370#define dbg_dump_budg(c) ({}) 430#define dbg_dump_node(c, node) ({})
371#define dbg_dump_lprop(c, lp) ({}) 431#define dbg_dump_lpt_node(c, node, lnum, offs) ({})
372#define dbg_dump_lprops(c) ({}) 432#define dbg_dump_budget_req(req) ({})
373#define dbg_dump_lpt_info(c) ({}) 433#define dbg_dump_lstats(lst) ({})
374#define dbg_dump_leb(c, lnum) ({}) 434#define dbg_dump_budg(c) ({})
375#define dbg_dump_znode(c, znode) ({}) 435#define dbg_dump_lprop(c, lp) ({})
376#define dbg_dump_heap(c, heap, cat) ({}) 436#define dbg_dump_lprops(c) ({})
377#define dbg_dump_pnode(c, pnode, parent, iip) ({}) 437#define dbg_dump_lpt_info(c) ({})
378#define dbg_dump_tnc(c) ({}) 438#define dbg_dump_leb(c, lnum) ({})
379#define dbg_dump_index(c) ({}) 439#define dbg_dump_znode(c, znode) ({})
440#define dbg_dump_heap(c, heap, cat) ({})
441#define dbg_dump_pnode(c, pnode, parent, iip) ({})
442#define dbg_dump_tnc(c) ({})
443#define dbg_dump_index(c) ({})
444#define dbg_dump_lpt_lebs(c) ({})
380 445
381#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 446#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
382#define dbg_old_index_check_init(c, zroot) 0 447#define dbg_old_index_check_init(c, zroot) 0
448#define dbg_save_space_info(c) ({})
449#define dbg_check_space_info(c) 0
383#define dbg_check_old_index(c, zroot) 0 450#define dbg_check_old_index(c, zroot) 0
384#define dbg_check_cats(c) 0 451#define dbg_check_cats(c) 0
385#define dbg_check_ltab(c) 0 452#define dbg_check_ltab(c) 0
@@ -396,9 +463,11 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
396#define dbg_force_in_the_gaps_enabled 0 463#define dbg_force_in_the_gaps_enabled 0
397#define dbg_force_in_the_gaps() 0 464#define dbg_force_in_the_gaps() 0
398#define dbg_failure_mode 0 465#define dbg_failure_mode 0
399#define dbg_failure_mode_registration(c) ({})
400#define dbg_failure_mode_deregistration(c) ({})
401 466
402#endif /* !CONFIG_UBIFS_FS_DEBUG */ 467#define dbg_debugfs_init() 0
468#define dbg_debugfs_exit()
469#define dbg_debugfs_init_fs(c) 0
470#define dbg_debugfs_exit_fs(c) 0
403 471
472#endif /* !CONFIG_UBIFS_FS_DEBUG */
404#endif /* !__UBIFS_DEBUG_H__ */ 473#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f448ab1f9c38..f55d523c52bb 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -482,30 +482,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
482} 482}
483 483
484/** 484/**
485 * lock_2_inodes - lock two UBIFS inodes. 485 * lock_2_inodes - a wrapper for locking two UBIFS inodes.
486 * @inode1: first inode 486 * @inode1: first inode
487 * @inode2: second inode 487 * @inode2: second inode
488 *
489 * We do not implement any tricks to guarantee strict lock ordering, because
490 * VFS has already done it for us on the @i_mutex. So this is just a simple
491 * wrapper function.
488 */ 492 */
489static void lock_2_inodes(struct inode *inode1, struct inode *inode2) 493static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
490{ 494{
491 if (inode1->i_ino < inode2->i_ino) { 495 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
492 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2); 496 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
493 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
494 } else {
495 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
496 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
497 }
498} 497}
499 498
500/** 499/**
501 * unlock_2_inodes - unlock two UBIFS inodes inodes. 500 * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
502 * @inode1: first inode 501 * @inode1: first inode
503 * @inode2: second inode 502 * @inode2: second inode
504 */ 503 */
505static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) 504static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
506{ 505{
507 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
508 mutex_unlock(&ubifs_inode(inode2)->ui_mutex); 506 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
507 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
509} 508}
510 509
511static int ubifs_link(struct dentry *old_dentry, struct inode *dir, 510static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
@@ -527,6 +526,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
527 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu", 526 dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
528 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 527 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
529 inode->i_nlink, dir->i_ino); 528 inode->i_nlink, dir->i_ino);
529 ubifs_assert(mutex_is_locked(&dir->i_mutex));
530 ubifs_assert(mutex_is_locked(&inode->i_mutex));
530 err = dbg_check_synced_i_size(inode); 531 err = dbg_check_synced_i_size(inode);
531 if (err) 532 if (err)
532 return err; 533 return err;
@@ -580,6 +581,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
580 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu", 581 dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
581 dentry->d_name.len, dentry->d_name.name, inode->i_ino, 582 dentry->d_name.len, dentry->d_name.name, inode->i_ino,
582 inode->i_nlink, dir->i_ino); 583 inode->i_nlink, dir->i_ino);
584 ubifs_assert(mutex_is_locked(&dir->i_mutex));
585 ubifs_assert(mutex_is_locked(&inode->i_mutex));
583 err = dbg_check_synced_i_size(inode); 586 err = dbg_check_synced_i_size(inode);
584 if (err) 587 if (err)
585 return err; 588 return err;
@@ -667,7 +670,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
667 670
668 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len, 671 dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
669 dentry->d_name.name, inode->i_ino, dir->i_ino); 672 dentry->d_name.name, inode->i_ino, dir->i_ino);
670 673 ubifs_assert(mutex_is_locked(&dir->i_mutex));
674 ubifs_assert(mutex_is_locked(&inode->i_mutex));
671 err = check_dir_empty(c, dentry->d_inode); 675 err = check_dir_empty(c, dentry->d_inode);
672 if (err) 676 if (err)
673 return err; 677 return err;
@@ -922,59 +926,30 @@ out_budg:
922} 926}
923 927
924/** 928/**
925 * lock_3_inodes - lock three UBIFS inodes for rename. 929 * lock_3_inodes - a wrapper for locking three UBIFS inodes.
926 * @inode1: first inode 930 * @inode1: first inode
927 * @inode2: second inode 931 * @inode2: second inode
928 * @inode3: third inode 932 * @inode3: third inode
929 * 933 *
930 * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may 934 * This function is used for 'ubifs_rename()' and @inode1 may be the same as
931 * be null. 935 * @inode2 whereas @inode3 may be %NULL.
936 *
937 * We do not implement any tricks to guarantee strict lock ordering, because
938 * VFS has already done it for us on the @i_mutex. So this is just a simple
939 * wrapper function.
932 */ 940 */
933static void lock_3_inodes(struct inode *inode1, struct inode *inode2, 941static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
934 struct inode *inode3) 942 struct inode *inode3)
935{ 943{
936 struct inode *i1, *i2, *i3; 944 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
937 945 if (inode2 != inode1)
938 if (!inode3) { 946 mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
939 if (inode1 != inode2) { 947 if (inode3)
940 lock_2_inodes(inode1, inode2); 948 mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
941 return;
942 }
943 mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
944 return;
945 }
946
947 if (inode1 == inode2) {
948 lock_2_inodes(inode1, inode3);
949 return;
950 }
951
952 /* 3 different inodes */
953 if (inode1 < inode2) {
954 i3 = inode2;
955 if (inode1 < inode3) {
956 i1 = inode1;
957 i2 = inode3;
958 } else {
959 i1 = inode3;
960 i2 = inode1;
961 }
962 } else {
963 i3 = inode1;
964 if (inode2 < inode3) {
965 i1 = inode2;
966 i2 = inode3;
967 } else {
968 i1 = inode3;
969 i2 = inode2;
970 }
971 }
972 mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
973 lock_2_inodes(i2, i3);
974} 949}
975 950
976/** 951/**
977 * unlock_3_inodes - unlock three UBIFS inodes for rename. 952 * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
978 * @inode1: first inode 953 * @inode1: first inode
979 * @inode2: second inode 954 * @inode2: second inode
980 * @inode3: third inode 955 * @inode3: third inode
@@ -982,11 +957,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
982static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, 957static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
983 struct inode *inode3) 958 struct inode *inode3)
984{ 959{
985 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
986 if (inode1 != inode2)
987 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
988 if (inode3) 960 if (inode3)
989 mutex_unlock(&ubifs_inode(inode3)->ui_mutex); 961 mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
962 if (inode1 != inode2)
963 mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
964 mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
990} 965}
991 966
992static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, 967static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -1020,6 +995,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1020 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, 995 "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
1021 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, 996 old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
1022 new_dentry->d_name.name, new_dir->i_ino); 997 new_dentry->d_name.name, new_dir->i_ino);
998 ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
999 ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
1000 if (unlink)
1001 ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
1002
1023 1003
1024 if (unlink && is_dir) { 1004 if (unlink && is_dir) {
1025 err = check_dir_empty(c, new_inode); 1005 err = check_dir_empty(c, new_inode);
@@ -1199,7 +1179,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1199 return 0; 1179 return 0;
1200} 1180}
1201 1181
1202struct inode_operations ubifs_dir_inode_operations = { 1182const struct inode_operations ubifs_dir_inode_operations = {
1203 .lookup = ubifs_lookup, 1183 .lookup = ubifs_lookup,
1204 .create = ubifs_create, 1184 .create = ubifs_create,
1205 .link = ubifs_link, 1185 .link = ubifs_link,
@@ -1219,7 +1199,7 @@ struct inode_operations ubifs_dir_inode_operations = {
1219#endif 1199#endif
1220}; 1200};
1221 1201
1222struct file_operations ubifs_dir_operations = { 1202const struct file_operations ubifs_dir_operations = {
1223 .llseek = ubifs_dir_llseek, 1203 .llseek = ubifs_dir_llseek,
1224 .release = ubifs_dir_release, 1204 .release = ubifs_dir_release,
1225 .read = generic_read_dir, 1205 .read = generic_read_dir,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2624411d9758..93b6de51f261 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
72 return err; 72 return err;
73 } 73 }
74 74
75 ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum); 75 ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
76 76 ubifs_inode(inode)->creat_sqnum);
77 len = le32_to_cpu(dn->size); 77 len = le32_to_cpu(dn->size);
78 if (len <= 0 || len > UBIFS_BLOCK_SIZE) 78 if (len <= 0 || len > UBIFS_BLOCK_SIZE)
79 goto dump; 79 goto dump;
@@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
219} 219}
220 220
221static int write_begin_slow(struct address_space *mapping, 221static int write_begin_slow(struct address_space *mapping,
222 loff_t pos, unsigned len, struct page **pagep) 222 loff_t pos, unsigned len, struct page **pagep,
223 unsigned flags)
223{ 224{
224 struct inode *inode = mapping->host; 225 struct inode *inode = mapping->host;
225 struct ubifs_info *c = inode->i_sb->s_fs_info; 226 struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -247,14 +248,14 @@ static int write_begin_slow(struct address_space *mapping,
247 if (unlikely(err)) 248 if (unlikely(err))
248 return err; 249 return err;
249 250
250 page = __grab_cache_page(mapping, index); 251 page = grab_cache_page_write_begin(mapping, index, flags);
251 if (unlikely(!page)) { 252 if (unlikely(!page)) {
252 ubifs_release_budget(c, &req); 253 ubifs_release_budget(c, &req);
253 return -ENOMEM; 254 return -ENOMEM;
254 } 255 }
255 256
256 if (!PageUptodate(page)) { 257 if (!PageUptodate(page)) {
257 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 258 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
258 SetPageChecked(page); 259 SetPageChecked(page);
259 else { 260 else {
260 err = do_readpage(page); 261 err = do_readpage(page);
@@ -431,20 +432,19 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
431 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
432 struct page *page; 433 struct page *page;
433 434
434
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
436 436
437 if (unlikely(c->ro_media)) 437 if (unlikely(c->ro_media))
438 return -EROFS; 438 return -EROFS;
439 439
440 /* Try out the fast-path part first */ 440 /* Try out the fast-path part first */
441 page = __grab_cache_page(mapping, index); 441 page = grab_cache_page_write_begin(mapping, index, flags);
442 if (unlikely(!page)) 442 if (unlikely(!page))
443 return -ENOMEM; 443 return -ENOMEM;
444 444
445 if (!PageUptodate(page)) { 445 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 446 /* The page is not loaded from the flash */
447 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 447 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
448 /* 448 /*
449 * We change whole page so no need to load it. But we 449 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 450 * have to set the @PG_checked flag to make the further
@@ -483,7 +483,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
483 unlock_page(page); 483 unlock_page(page);
484 page_cache_release(page); 484 page_cache_release(page);
485 485
486 return write_begin_slow(mapping, pos, len, pagep); 486 return write_begin_slow(mapping, pos, len, pagep, flags);
487 } 487 }
488 488
489 /* 489 /*
@@ -1540,7 +1540,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1540 return 0; 1540 return 0;
1541} 1541}
1542 1542
1543struct address_space_operations ubifs_file_address_operations = { 1543const struct address_space_operations ubifs_file_address_operations = {
1544 .readpage = ubifs_readpage, 1544 .readpage = ubifs_readpage,
1545 .writepage = ubifs_writepage, 1545 .writepage = ubifs_writepage,
1546 .write_begin = ubifs_write_begin, 1546 .write_begin = ubifs_write_begin,
@@ -1550,7 +1550,7 @@ struct address_space_operations ubifs_file_address_operations = {
1550 .releasepage = ubifs_releasepage, 1550 .releasepage = ubifs_releasepage,
1551}; 1551};
1552 1552
1553struct inode_operations ubifs_file_inode_operations = { 1553const struct inode_operations ubifs_file_inode_operations = {
1554 .setattr = ubifs_setattr, 1554 .setattr = ubifs_setattr,
1555 .getattr = ubifs_getattr, 1555 .getattr = ubifs_getattr,
1556#ifdef CONFIG_UBIFS_FS_XATTR 1556#ifdef CONFIG_UBIFS_FS_XATTR
@@ -1561,14 +1561,14 @@ struct inode_operations ubifs_file_inode_operations = {
1561#endif 1561#endif
1562}; 1562};
1563 1563
1564struct inode_operations ubifs_symlink_inode_operations = { 1564const struct inode_operations ubifs_symlink_inode_operations = {
1565 .readlink = generic_readlink, 1565 .readlink = generic_readlink,
1566 .follow_link = ubifs_follow_link, 1566 .follow_link = ubifs_follow_link,
1567 .setattr = ubifs_setattr, 1567 .setattr = ubifs_setattr,
1568 .getattr = ubifs_getattr, 1568 .getattr = ubifs_getattr,
1569}; 1569};
1570 1570
1571struct file_operations ubifs_file_operations = { 1571const struct file_operations ubifs_file_operations = {
1572 .llseek = generic_file_llseek, 1572 .llseek = generic_file_llseek,
1573 .read = do_sync_read, 1573 .read = do_sync_read,
1574 .write = do_sync_write, 1574 .write = do_sync_write,
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 0bef6501d58a..a711d33b3d3e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -31,6 +31,26 @@
31 * to be reused. Garbage collection will cause the number of dirty index nodes 31 * to be reused. Garbage collection will cause the number of dirty index nodes
32 * to grow, however sufficient space is reserved for the index to ensure the 32 * to grow, however sufficient space is reserved for the index to ensure the
33 * commit will never run out of space. 33 * commit will never run out of space.
34 *
35 * Notes about dead watermark. At current UBIFS implementation we assume that
36 * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
37 * and not worth garbage-collecting. The dead watermark is one min. I/O unit
38 * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
39 * Garbage Collector has to synchronize the GC head's write buffer before
40 * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
41 * actually reclaim even very small pieces of dirty space by garbage collecting
42 * enough dirty LEBs, but we do not bother doing this at this implementation.
43 *
44 * Notes about dark watermark. The results of GC work depends on how big are
45 * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
46 * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
53 * good, and GC takes extra care when moving them.
34 */ 54 */
35 55
36#include <linux/pagemap.h> 56#include <linux/pagemap.h>
@@ -45,7 +65,7 @@
45#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ 65#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
46 66
47/* 67/*
48 * GC may need to move more then one LEB to make progress. The below constants 68 * GC may need to move more than one LEB to make progress. The below constants
49 * define "soft" and "hard" limits on the number of LEBs the garbage collector 69 * define "soft" and "hard" limits on the number of LEBs the garbage collector
50 * may move. 70 * may move.
51 */ 71 */
@@ -381,7 +401,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
381 401
382 /* 402 /*
383 * Don't release the LEB until after the next commit, because 403 * Don't release the LEB until after the next commit, because
384 * it may contain date which is needed for recovery. So 404 * it may contain data which is needed for recovery. So
385 * although we freed this LEB, it will become usable only after 405 * although we freed this LEB, it will become usable only after
386 * the commit. 406 * the commit.
387 */ 407 */
@@ -810,8 +830,9 @@ out:
810 * ubifs_destroy_idx_gc - destroy idx_gc list. 830 * ubifs_destroy_idx_gc - destroy idx_gc list.
811 * @c: UBIFS file-system description object 831 * @c: UBIFS file-system description object
812 * 832 *
813 * This function destroys the idx_gc list. It is called when unmounting or 833 * This function destroys the @c->idx_gc list. It is called when unmounting
814 * remounting read-only so locks are not needed. 834 * so locks are not needed. Returns zero in case of success and a negative
835 * error code in case of failure.
815 */ 836 */
816void ubifs_destroy_idx_gc(struct ubifs_info *c) 837void ubifs_destroy_idx_gc(struct ubifs_info *c)
817{ 838{
@@ -824,7 +845,6 @@ void ubifs_destroy_idx_gc(struct ubifs_info *c)
824 list_del(&idx_gc->list); 845 list_del(&idx_gc->list);
825 kfree(idx_gc); 846 kfree(idx_gc);
826 } 847 }
827
828} 848}
829 849
830/** 850/**
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01682713af69..e8e632a1dcdf 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -29,7 +29,7 @@
29 * would have been wasted for padding to the nearest minimal I/O unit boundary. 29 * would have been wasted for padding to the nearest minimal I/O unit boundary.
30 * Instead, data first goes to the write-buffer and is flushed when the 30 * Instead, data first goes to the write-buffer and is flushed when the
31 * buffer is full or when it is not used for some time (by timer). This is 31 * buffer is full or when it is not used for some time (by timer). This is
32 * similarto the mechanism is used by JFFS2. 32 * similar to the mechanism is used by JFFS2.
33 * 33 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by 34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code 35 * mutexes defined inside these objects. Since sometimes upper-level code
@@ -75,7 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
75 * @lnum: logical eraseblock number 75 * @lnum: logical eraseblock number
76 * @offs: offset within the logical eraseblock 76 * @offs: offset within the logical eraseblock
77 * @quiet: print no messages 77 * @quiet: print no messages
78 * @chk_crc: indicates whether to always check the CRC 78 * @must_chk_crc: indicates whether to always check the CRC
79 * 79 *
80 * This function checks node magic number and CRC checksum. This function also 80 * This function checks node magic number and CRC checksum. This function also
81 * validates node length to prevent UBIFS from becoming crazy when an attacker 81 * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -83,11 +83,17 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
83 * node length in the common header could cause UBIFS to read memory outside of 83 * node length in the common header could cause UBIFS to read memory outside of
84 * allocated buffer when checking the CRC checksum. 84 * allocated buffer when checking the CRC checksum.
85 * 85 *
86 * This function returns zero in case of success %-EUCLEAN in case of bad CRC 86 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
87 * or magic. 87 * true, which is controlled by corresponding UBIFS mount option. However, if
88 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
89 * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
90 * ignored and CRC is checked.
91 *
92 * This function returns zero in case of success and %-EUCLEAN in case of bad
93 * CRC or magic.
88 */ 94 */
89int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, 95int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
90 int offs, int quiet, int chk_crc) 96 int offs, int quiet, int must_chk_crc)
91{ 97{
92 int err = -EINVAL, type, node_len; 98 int err = -EINVAL, type, node_len;
93 uint32_t crc, node_crc, magic; 99 uint32_t crc, node_crc, magic;
@@ -123,9 +129,9 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
123 node_len > c->ranges[type].max_len) 129 node_len > c->ranges[type].max_len)
124 goto out_len; 130 goto out_len;
125 131
126 if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc) 132 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
127 if (c->no_chk_data_crc) 133 c->no_chk_data_crc)
128 return 0; 134 return 0;
129 135
130 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 136 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
131 node_crc = le32_to_cpu(ch->crc); 137 node_crc = le32_to_cpu(ch->crc);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 5e82cffe9695..6db7a6be6c97 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -154,6 +154,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
154 case FS_IOC_GETFLAGS: 154 case FS_IOC_GETFLAGS:
155 flags = ubifs2ioctl(ubifs_inode(inode)->flags); 155 flags = ubifs2ioctl(ubifs_inode(inode)->flags);
156 156
157 dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
157 return put_user(flags, (int __user *) arg); 158 return put_user(flags, (int __user *) arg);
158 159
159 case FS_IOC_SETFLAGS: { 160 case FS_IOC_SETFLAGS: {
@@ -176,6 +177,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
176 err = mnt_want_write(file->f_path.mnt); 177 err = mnt_want_write(file->f_path.mnt);
177 if (err) 178 if (err)
178 return err; 179 return err;
180 dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
179 err = setflags(inode, flags); 181 err = setflags(inode, flags);
180 mnt_drop_write(file->f_path.mnt); 182 mnt_drop_write(file->f_path.mnt);
181 return err; 183 return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f91b745908ea..a11ca0958a23 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -191,7 +191,7 @@ again:
191 if (wbuf->lnum != -1 && avail >= len) { 191 if (wbuf->lnum != -1 && avail >= len) {
192 /* 192 /*
193 * Someone else has switched the journal head and we have 193 * Someone else has switched the journal head and we have
194 * enough space now. This happens when more then one process is 194 * enough space now. This happens when more than one process is
195 * trying to write to the same journal head at the same time. 195 * trying to write to the same journal head at the same time.
196 */ 196 */
197 dbg_jnl("return LEB %d back, already have LEB %d:%d", 197 dbg_jnl("return LEB %d back, already have LEB %d:%d",
@@ -208,7 +208,7 @@ again:
208 offs = 0; 208 offs = 0;
209 209
210out: 210out:
211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM); 211 err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
212 if (err) 212 if (err)
213 goto out_unlock; 213 goto out_unlock;
214 214
@@ -704,7 +704,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
704 data->size = cpu_to_le32(len); 704 data->size = cpu_to_le32(len);
705 zero_data_node_unused(data); 705 zero_data_node_unused(data);
706 706
707 if (!(ui->flags && UBIFS_COMPR_FL)) 707 if (!(ui->flags & UBIFS_COMPR_FL))
708 /* Compression is disabled for this inode */ 708 /* Compression is disabled for this inode */
709 compr_type = UBIFS_COMPR_NONE; 709 compr_type = UBIFS_COMPR_NONE;
710 else 710 else
@@ -1220,7 +1220,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1220 data_key_init(c, &key, inum, blk); 1220 data_key_init(c, &key, inum, blk);
1221 1221
1222 bit = old_size & (UBIFS_BLOCK_SIZE - 1); 1222 bit = old_size & (UBIFS_BLOCK_SIZE - 1);
1223 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1); 1223 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1);
1224 data_key_init(c, &to_key, inum, blk); 1224 data_key_init(c, &to_key, inum, blk);
1225 1225
1226 err = ubifs_tnc_remove_range(c, &key, &to_key); 1226 err = ubifs_tnc_remove_range(c, &key, &to_key);
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 3f1f16bc25c9..efb3430a2581 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -38,6 +38,22 @@
38#define __UBIFS_KEY_H__ 38#define __UBIFS_KEY_H__
39 39
40/** 40/**
41 * key_mask_hash - mask a valid hash value.
42 * @hash: hash value to be masked
43 *
44 * We use hash values as offset in directories, so values %0 and %1 are
45 * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This
46 * function makes sure the reserved values are not used.
47 */
48static inline uint32_t key_mask_hash(uint32_t hash)
49{
50 hash &= UBIFS_S_KEY_HASH_MASK;
51 if (unlikely(hash <= 2))
52 hash += 3;
53 return hash;
54}
55
56/**
41 * key_r5_hash - R5 hash function (borrowed from reiserfs). 57 * key_r5_hash - R5 hash function (borrowed from reiserfs).
42 * @s: direntry name 58 * @s: direntry name
43 * @len: name length 59 * @len: name length
@@ -54,16 +70,7 @@ static inline uint32_t key_r5_hash(const char *s, int len)
54 str++; 70 str++;
55 } 71 }
56 72
57 a &= UBIFS_S_KEY_HASH_MASK; 73 return key_mask_hash(a);
58
59 /*
60 * We use hash values as offset in directories, so values %0 and %1 are
61 * reserved for "." and "..". %2 is reserved for "end of readdir"
62 * marker.
63 */
64 if (unlikely(a >= 0 && a <= 2))
65 a += 3;
66 return a;
67} 74}
68 75
69/** 76/**
@@ -77,10 +84,7 @@ static inline uint32_t key_test_hash(const char *str, int len)
77 84
78 len = min_t(uint32_t, len, 4); 85 len = min_t(uint32_t, len, 4);
79 memcpy(&a, str, len); 86 memcpy(&a, str, len);
80 a &= UBIFS_S_KEY_HASH_MASK; 87 return key_mask_hash(a);
81 if (unlikely(a >= 0 && a <= 2))
82 a += 3;
83 return a;
84} 88}
85 89
86/** 90/**
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index f27176e9b70d..4cdd284dea56 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -520,13 +520,13 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
520 * @flags: new flags 520 * @flags: new flags
521 * @idx_gc_cnt: change to the count of idx_gc list 521 * @idx_gc_cnt: change to the count of idx_gc list
522 * 522 *
523 * This function changes LEB properties. This function does not change a LEB 523 * This function changes LEB properties (@free, @dirty or @flag). However, the
524 * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC. 524 * property which has the %LPROPS_NC value is not changed. Returns a pointer to
525 * the updated LEB properties on success and a negative error code on failure.
525 * 526 *
526 * This function returns a pointer to the updated LEB properties on success 527 * Note, the LEB properties may have had to be copied (due to COW) and
527 * and a negative error code on failure. N.B. the LEB properties may have had to 528 * consequently the pointer returned may not be the same as the pointer
528 * be copied (due to COW) and consequently the pointer returned may not be the 529 * passed.
529 * same as the pointer passed.
530 */ 530 */
531const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, 531const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
532 const struct ubifs_lprops *lp, 532 const struct ubifs_lprops *lp,
@@ -635,10 +635,10 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
635 * @c: UBIFS file-system description object 635 * @c: UBIFS file-system description object
636 * @st: return statistics 636 * @st: return statistics
637 */ 637 */
638void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st) 638void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
639{ 639{
640 spin_lock(&c->space_lock); 640 spin_lock(&c->space_lock);
641 memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats)); 641 memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats));
642 spin_unlock(&c->space_lock); 642 spin_unlock(&c->space_lock);
643} 643}
644 644
@@ -678,6 +678,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
678 678
679out: 679out:
680 ubifs_release_lprops(c); 680 ubifs_release_lprops(c);
681 if (err)
682 ubifs_err("cannot change properties of LEB %d, error %d",
683 lnum, err);
681 return err; 684 return err;
682} 685}
683 686
@@ -714,6 +717,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
714 717
715out: 718out:
716 ubifs_release_lprops(c); 719 ubifs_release_lprops(c);
720 if (err)
721 ubifs_err("cannot update properties of LEB %d, error %d",
722 lnum, err);
717 return err; 723 return err;
718} 724}
719 725
@@ -737,6 +743,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
737 lpp = ubifs_lpt_lookup(c, lnum); 743 lpp = ubifs_lpt_lookup(c, lnum);
738 if (IS_ERR(lpp)) { 744 if (IS_ERR(lpp)) {
739 err = PTR_ERR(lpp); 745 err = PTR_ERR(lpp);
746 ubifs_err("cannot read properties of LEB %d, error %d",
747 lnum, err);
740 goto out; 748 goto out;
741 } 749 }
742 750
@@ -1088,7 +1096,7 @@ static int scan_check_cb(struct ubifs_info *c,
1088 } 1096 }
1089 } 1097 }
1090 1098
1091 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 1099 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
1092 if (IS_ERR(sleb)) { 1100 if (IS_ERR(sleb)) {
1093 /* 1101 /*
1094 * After an unclean unmount, empty and freeable LEBs 1102 * After an unclean unmount, empty and freeable LEBs
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index db8bd0e518b2..b2792e84d245 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -36,15 +36,16 @@
36 * can be written into a single eraseblock. In that case, garbage collection 36 * can be written into a single eraseblock. In that case, garbage collection
37 * consists of just writing the whole table, which therefore makes all other 37 * consists of just writing the whole table, which therefore makes all other
38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are 38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
39 * selected for garbage collection, which consists are marking the nodes in 39 * selected for garbage collection, which consists of marking the clean nodes in
40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in 40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in
41 * the case of the big model, a table of LEB numbers is saved so that the entire 41 * the case of the big model, a table of LEB numbers is saved so that the entire
42 * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first 42 * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first
43 * mounted. 43 * mounted.
44 */ 44 */
45 45
46#include <linux/crc16.h>
47#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h>
48#include <linux/math64.h>
48 49
49/** 50/**
50 * do_calc_lpt_geom - calculate sizes for the LPT area. 51 * do_calc_lpt_geom - calculate sizes for the LPT area.
@@ -135,15 +136,13 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
135int ubifs_calc_lpt_geom(struct ubifs_info *c) 136int ubifs_calc_lpt_geom(struct ubifs_info *c)
136{ 137{
137 int lebs_needed; 138 int lebs_needed;
138 uint64_t sz; 139 long long sz;
139 140
140 do_calc_lpt_geom(c); 141 do_calc_lpt_geom(c);
141 142
142 /* Verify that lpt_lebs is big enough */ 143 /* Verify that lpt_lebs is big enough */
143 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ 144 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
144 sz += c->leb_size - 1; 145 lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
145 do_div(sz, c->leb_size);
146 lebs_needed = sz;
147 if (lebs_needed > c->lpt_lebs) { 146 if (lebs_needed > c->lpt_lebs) {
148 ubifs_err("too few LPT LEBs"); 147 ubifs_err("too few LPT LEBs");
149 return -EINVAL; 148 return -EINVAL;
@@ -156,7 +155,6 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c)
156 } 155 }
157 156
158 c->check_lpt_free = c->big_lpt; 157 c->check_lpt_free = c->big_lpt;
159
160 return 0; 158 return 0;
161} 159}
162 160
@@ -176,7 +174,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
176 int *big_lpt) 174 int *big_lpt)
177{ 175{
178 int i, lebs_needed; 176 int i, lebs_needed;
179 uint64_t sz; 177 long long sz;
180 178
181 /* Start by assuming the minimum number of LPT LEBs */ 179 /* Start by assuming the minimum number of LPT LEBs */
182 c->lpt_lebs = UBIFS_MIN_LPT_LEBS; 180 c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
@@ -203,9 +201,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
203 /* Now check there are enough LPT LEBs */ 201 /* Now check there are enough LPT LEBs */
204 for (i = 0; i < 64 ; i++) { 202 for (i = 0; i < 64 ; i++) {
205 sz = c->lpt_sz * 4; /* Allow 4 times the size */ 203 sz = c->lpt_sz * 4; /* Allow 4 times the size */
206 sz += c->leb_size - 1; 204 lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
207 do_div(sz, c->leb_size);
208 lebs_needed = sz;
209 if (lebs_needed > c->lpt_lebs) { 205 if (lebs_needed > c->lpt_lebs) {
210 /* Not enough LPT LEBs so try again with more */ 206 /* Not enough LPT LEBs so try again with more */
211 c->lpt_lebs = lebs_needed; 207 c->lpt_lebs = lebs_needed;
@@ -558,7 +554,7 @@ static int calc_nnode_num(int row, int col)
558 * This function calculates and returns the nnode number based on the parent's 554 * This function calculates and returns the nnode number based on the parent's
559 * nnode number and the index in parent. 555 * nnode number and the index in parent.
560 */ 556 */
561static int calc_nnode_num_from_parent(struct ubifs_info *c, 557static int calc_nnode_num_from_parent(const struct ubifs_info *c,
562 struct ubifs_nnode *parent, int iip) 558 struct ubifs_nnode *parent, int iip)
563{ 559{
564 int num, shft; 560 int num, shft;
@@ -583,7 +579,7 @@ static int calc_nnode_num_from_parent(struct ubifs_info *c,
583 * This function calculates and returns the pnode number based on the parent's 579 * This function calculates and returns the pnode number based on the parent's
584 * nnode number and the index in parent. 580 * nnode number and the index in parent.
585 */ 581 */
586static int calc_pnode_num_from_parent(struct ubifs_info *c, 582static int calc_pnode_num_from_parent(const struct ubifs_info *c,
587 struct ubifs_nnode *parent, int iip) 583 struct ubifs_nnode *parent, int iip)
588{ 584{
589 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; 585 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
@@ -966,7 +962,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type)
966 * 962 *
967 * This function returns %0 on success and a negative error code on failure. 963 * This function returns %0 on success and a negative error code on failure.
968 */ 964 */
969static int unpack_pnode(struct ubifs_info *c, void *buf, 965static int unpack_pnode(const struct ubifs_info *c, void *buf,
970 struct ubifs_pnode *pnode) 966 struct ubifs_pnode *pnode)
971{ 967{
972 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 968 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
@@ -996,15 +992,15 @@ static int unpack_pnode(struct ubifs_info *c, void *buf,
996} 992}
997 993
998/** 994/**
999 * unpack_nnode - unpack a nnode. 995 * ubifs_unpack_nnode - unpack a nnode.
1000 * @c: UBIFS file-system description object 996 * @c: UBIFS file-system description object
1001 * @buf: buffer containing packed nnode to unpack 997 * @buf: buffer containing packed nnode to unpack
1002 * @nnode: nnode structure to fill 998 * @nnode: nnode structure to fill
1003 * 999 *
1004 * This function returns %0 on success and a negative error code on failure. 1000 * This function returns %0 on success and a negative error code on failure.
1005 */ 1001 */
1006static int unpack_nnode(struct ubifs_info *c, void *buf, 1002int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
1007 struct ubifs_nnode *nnode) 1003 struct ubifs_nnode *nnode)
1008{ 1004{
1009 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1005 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1010 int i, pos = 0, err; 1006 int i, pos = 0, err;
@@ -1036,7 +1032,7 @@ static int unpack_nnode(struct ubifs_info *c, void *buf,
1036 * 1032 *
1037 * This function returns %0 on success and a negative error code on failure. 1033 * This function returns %0 on success and a negative error code on failure.
1038 */ 1034 */
1039static int unpack_ltab(struct ubifs_info *c, void *buf) 1035static int unpack_ltab(const struct ubifs_info *c, void *buf)
1040{ 1036{
1041 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1037 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1042 int i, pos = 0, err; 1038 int i, pos = 0, err;
@@ -1068,7 +1064,7 @@ static int unpack_ltab(struct ubifs_info *c, void *buf)
1068 * 1064 *
1069 * This function returns %0 on success and a negative error code on failure. 1065 * This function returns %0 on success and a negative error code on failure.
1070 */ 1066 */
1071static int unpack_lsave(struct ubifs_info *c, void *buf) 1067static int unpack_lsave(const struct ubifs_info *c, void *buf)
1072{ 1068{
1073 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1069 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1074 int i, pos = 0, err; 1070 int i, pos = 0, err;
@@ -1096,7 +1092,7 @@ static int unpack_lsave(struct ubifs_info *c, void *buf)
1096 * 1092 *
1097 * This function returns %0 on success and a negative error code on failure. 1093 * This function returns %0 on success and a negative error code on failure.
1098 */ 1094 */
1099static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, 1095static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode,
1100 struct ubifs_nnode *parent, int iip) 1096 struct ubifs_nnode *parent, int iip)
1101{ 1097{
1102 int i, lvl, max_offs; 1098 int i, lvl, max_offs;
@@ -1140,7 +1136,7 @@ static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
1140 * 1136 *
1141 * This function returns %0 on success and a negative error code on failure. 1137 * This function returns %0 on success and a negative error code on failure.
1142 */ 1138 */
1143static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 1139static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode,
1144 struct ubifs_nnode *parent, int iip) 1140 struct ubifs_nnode *parent, int iip)
1145{ 1141{
1146 int i; 1142 int i;
@@ -1174,7 +1170,8 @@ static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
1174 * This function calculates the LEB numbers for the LEB properties it contains 1170 * This function calculates the LEB numbers for the LEB properties it contains
1175 * based on the pnode number. 1171 * based on the pnode number.
1176 */ 1172 */
1177static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode) 1173static void set_pnode_lnum(const struct ubifs_info *c,
1174 struct ubifs_pnode *pnode)
1178{ 1175{
1179 int i, lnum; 1176 int i, lnum;
1180 1177
@@ -1227,7 +1224,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1227 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); 1224 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
1228 if (err) 1225 if (err)
1229 goto out; 1226 goto out;
1230 err = unpack_nnode(c, buf, nnode); 1227 err = ubifs_unpack_nnode(c, buf, nnode);
1231 if (err) 1228 if (err)
1232 goto out; 1229 goto out;
1233 } 1230 }
@@ -1816,7 +1813,7 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
1816 c->nnode_sz); 1813 c->nnode_sz);
1817 if (err) 1814 if (err)
1818 return ERR_PTR(err); 1815 return ERR_PTR(err);
1819 err = unpack_nnode(c, buf, nnode); 1816 err = ubifs_unpack_nnode(c, buf, nnode);
1820 if (err) 1817 if (err)
1821 return ERR_PTR(err); 1818 return ERR_PTR(err);
1822 } 1819 }
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index a41434b42785..3216a1f277f8 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -320,6 +320,8 @@ no_space:
320 dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " 320 dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
321 "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 321 "done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
322 dbg_dump_lpt_info(c); 322 dbg_dump_lpt_info(c);
323 dbg_dump_lpt_lebs(c);
324 dump_stack();
323 return err; 325 return err;
324} 326}
325 327
@@ -546,29 +548,31 @@ static int write_cnodes(struct ubifs_info *c)
546no_space: 548no_space:
547 ubifs_err("LPT out of space mismatch"); 549 ubifs_err("LPT out of space mismatch");
548 dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " 550 dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
549 "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 551 "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
550 dbg_dump_lpt_info(c); 552 dbg_dump_lpt_info(c);
553 dbg_dump_lpt_lebs(c);
554 dump_stack();
551 return err; 555 return err;
552} 556}
553 557
554/** 558/**
555 * next_pnode - find next pnode. 559 * next_pnode_to_dirty - find next pnode to dirty.
556 * @c: UBIFS file-system description object 560 * @c: UBIFS file-system description object
557 * @pnode: pnode 561 * @pnode: pnode
558 * 562 *
559 * This function returns the next pnode or %NULL if there are no more pnodes. 563 * This function returns the next pnode to dirty or %NULL if there are no more
564 * pnodes. Note that pnodes that have never been written (lnum == 0) are
565 * skipped.
560 */ 566 */
561static struct ubifs_pnode *next_pnode(struct ubifs_info *c, 567static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
562 struct ubifs_pnode *pnode) 568 struct ubifs_pnode *pnode)
563{ 569{
564 struct ubifs_nnode *nnode; 570 struct ubifs_nnode *nnode;
565 int iip; 571 int iip;
566 572
567 /* Try to go right */ 573 /* Try to go right */
568 nnode = pnode->parent; 574 nnode = pnode->parent;
569 iip = pnode->iip + 1; 575 for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
570 if (iip < UBIFS_LPT_FANOUT) {
571 /* We assume here that LEB zero is never an LPT LEB */
572 if (nnode->nbranch[iip].lnum) 576 if (nnode->nbranch[iip].lnum)
573 return ubifs_get_pnode(c, nnode, iip); 577 return ubifs_get_pnode(c, nnode, iip);
574 } 578 }
@@ -579,8 +583,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
579 nnode = nnode->parent; 583 nnode = nnode->parent;
580 if (!nnode) 584 if (!nnode)
581 return NULL; 585 return NULL;
582 /* We assume here that LEB zero is never an LPT LEB */ 586 for (; iip < UBIFS_LPT_FANOUT; iip++) {
583 } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum); 587 if (nnode->nbranch[iip].lnum)
588 break;
589 }
590 } while (iip >= UBIFS_LPT_FANOUT);
584 591
585 /* Go right */ 592 /* Go right */
586 nnode = ubifs_get_nnode(c, nnode, iip); 593 nnode = ubifs_get_nnode(c, nnode, iip);
@@ -589,12 +596,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
589 596
590 /* Go down to level 1 */ 597 /* Go down to level 1 */
591 while (nnode->level > 1) { 598 while (nnode->level > 1) {
592 nnode = ubifs_get_nnode(c, nnode, 0); 599 for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
600 if (nnode->nbranch[iip].lnum)
601 break;
602 }
603 if (iip >= UBIFS_LPT_FANOUT) {
604 /*
605 * Should not happen, but we need to keep going
606 * if it does.
607 */
608 iip = 0;
609 }
610 nnode = ubifs_get_nnode(c, nnode, iip);
593 if (IS_ERR(nnode)) 611 if (IS_ERR(nnode))
594 return (void *)nnode; 612 return (void *)nnode;
595 } 613 }
596 614
597 return ubifs_get_pnode(c, nnode, 0); 615 for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
616 if (nnode->nbranch[iip].lnum)
617 break;
618 if (iip >= UBIFS_LPT_FANOUT)
619 /* Should not happen, but we need to keep going if it does */
620 iip = 0;
621 return ubifs_get_pnode(c, nnode, iip);
598} 622}
599 623
600/** 624/**
@@ -684,7 +708,7 @@ static int make_tree_dirty(struct ubifs_info *c)
684 pnode = pnode_lookup(c, 0); 708 pnode = pnode_lookup(c, 0);
685 while (pnode) { 709 while (pnode) {
686 do_make_pnode_dirty(c, pnode); 710 do_make_pnode_dirty(c, pnode);
687 pnode = next_pnode(c, pnode); 711 pnode = next_pnode_to_dirty(c, pnode);
688 if (IS_ERR(pnode)) 712 if (IS_ERR(pnode))
689 return PTR_ERR(pnode); 713 return PTR_ERR(pnode);
690 } 714 }
@@ -749,7 +773,7 @@ static void lpt_tgc_start(struct ubifs_info *c)
749 * LPT trivial garbage collection is where a LPT LEB contains only dirty and 773 * LPT trivial garbage collection is where a LPT LEB contains only dirty and
750 * free space and so may be reused as soon as the next commit is completed. 774 * free space and so may be reused as soon as the next commit is completed.
751 * This function is called after the commit is completed (master node has been 775 * This function is called after the commit is completed (master node has been
752 * written) and unmaps LPT LEBs that were marked for trivial GC. 776 * written) and un-maps LPT LEBs that were marked for trivial GC.
753 */ 777 */
754static int lpt_tgc_end(struct ubifs_info *c) 778static int lpt_tgc_end(struct ubifs_info *c)
755{ 779{
@@ -1025,7 +1049,7 @@ static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
1025 * @c: UBIFS file-system description object 1049 * @c: UBIFS file-system description object
1026 * @node_type: LPT node type 1050 * @node_type: LPT node type
1027 */ 1051 */
1028static int get_lpt_node_len(struct ubifs_info *c, int node_type) 1052static int get_lpt_node_len(const struct ubifs_info *c, int node_type)
1029{ 1053{
1030 switch (node_type) { 1054 switch (node_type) {
1031 case UBIFS_LPT_NNODE: 1055 case UBIFS_LPT_NNODE:
@@ -1046,7 +1070,7 @@ static int get_lpt_node_len(struct ubifs_info *c, int node_type)
1046 * @buf: buffer 1070 * @buf: buffer
1047 * @len: length of buffer 1071 * @len: length of buffer
1048 */ 1072 */
1049static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) 1073static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len)
1050{ 1074{
1051 int offs, pad_len; 1075 int offs, pad_len;
1052 1076
@@ -1063,7 +1087,8 @@ static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
1063 * @buf: buffer 1087 * @buf: buffer
1064 * @node_num: node number is returned here 1088 * @node_num: node number is returned here
1065 */ 1089 */
1066static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) 1090static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf,
1091 int *node_num)
1067{ 1092{
1068 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1093 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1069 int pos = 0, node_type; 1094 int pos = 0, node_type;
@@ -1081,7 +1106,7 @@ static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
1081 * 1106 *
1082 * This function returns %1 if the buffer contains a node or %0 if it does not. 1107 * This function returns %1 if the buffer contains a node or %0 if it does not.
1083 */ 1108 */
1084static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) 1109static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len)
1085{ 1110{
1086 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1111 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1087 int pos = 0, node_type, node_len; 1112 int pos = 0, node_type, node_len;
@@ -1105,7 +1130,6 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
1105 return 1; 1130 return 1;
1106} 1131}
1107 1132
1108
1109/** 1133/**
1110 * lpt_gc_lnum - garbage collect a LPT LEB. 1134 * lpt_gc_lnum - garbage collect a LPT LEB.
1111 * @c: UBIFS file-system description object 1135 * @c: UBIFS file-system description object
@@ -1463,7 +1487,7 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
1463#ifdef CONFIG_UBIFS_FS_DEBUG 1487#ifdef CONFIG_UBIFS_FS_DEBUG
1464 1488
1465/** 1489/**
1466 * dbg_is_all_ff - determine if a buffer contains only 0xff bytes. 1490 * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
1467 * @buf: buffer 1491 * @buf: buffer
1468 * @len: buffer length 1492 * @len: buffer length
1469 */ 1493 */
@@ -1488,7 +1512,7 @@ static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
1488 struct ubifs_nnode *nnode; 1512 struct ubifs_nnode *nnode;
1489 int hght; 1513 int hght;
1490 1514
1491 /* Entire tree is in memory so first_nnode / next_nnode are ok */ 1515 /* Entire tree is in memory so first_nnode / next_nnode are OK */
1492 nnode = first_nnode(c, &hght); 1516 nnode = first_nnode(c, &hght);
1493 for (; nnode; nnode = next_nnode(c, nnode, &hght)) { 1517 for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
1494 struct ubifs_nbranch *branch; 1518 struct ubifs_nbranch *branch;
@@ -1602,7 +1626,10 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1602{ 1626{
1603 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; 1627 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1604 int ret; 1628 int ret;
1605 void *buf = c->dbg_buf; 1629 void *buf = c->dbg->buf;
1630
1631 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1632 return 0;
1606 1633
1607 dbg_lp("LEB %d", lnum); 1634 dbg_lp("LEB %d", lnum);
1608 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1635 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
@@ -1704,6 +1731,9 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1704 long long free = 0; 1731 long long free = 0;
1705 int i; 1732 int i;
1706 1733
1734 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1735 return 0;
1736
1707 for (i = 0; i < c->lpt_lebs; i++) { 1737 for (i = 0; i < c->lpt_lebs; i++) {
1708 if (c->ltab[i].tgc || c->ltab[i].cmt) 1738 if (c->ltab[i].tgc || c->ltab[i].cmt)
1709 continue; 1739 continue;
@@ -1716,6 +1746,8 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1716 dbg_err("LPT space error: free %lld lpt_sz %lld", 1746 dbg_err("LPT space error: free %lld lpt_sz %lld",
1717 free, c->lpt_sz); 1747 free, c->lpt_sz);
1718 dbg_dump_lpt_info(c); 1748 dbg_dump_lpt_info(c);
1749 dbg_dump_lpt_lebs(c);
1750 dump_stack();
1719 return -EINVAL; 1751 return -EINVAL;
1720 } 1752 }
1721 return 0; 1753 return 0;
@@ -1731,15 +1763,19 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1731 */ 1763 */
1732int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1764int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1733{ 1765{
1766 struct ubifs_debug_info *d = c->dbg;
1734 long long chk_lpt_sz, lpt_sz; 1767 long long chk_lpt_sz, lpt_sz;
1735 int err = 0; 1768 int err = 0;
1736 1769
1770 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1771 return 0;
1772
1737 switch (action) { 1773 switch (action) {
1738 case 0: 1774 case 0:
1739 c->chk_lpt_sz = 0; 1775 d->chk_lpt_sz = 0;
1740 c->chk_lpt_sz2 = 0; 1776 d->chk_lpt_sz2 = 0;
1741 c->chk_lpt_lebs = 0; 1777 d->chk_lpt_lebs = 0;
1742 c->chk_lpt_wastage = 0; 1778 d->chk_lpt_wastage = 0;
1743 if (c->dirty_pn_cnt > c->pnode_cnt) { 1779 if (c->dirty_pn_cnt > c->pnode_cnt) {
1744 dbg_err("dirty pnodes %d exceed max %d", 1780 dbg_err("dirty pnodes %d exceed max %d",
1745 c->dirty_pn_cnt, c->pnode_cnt); 1781 c->dirty_pn_cnt, c->pnode_cnt);
@@ -1752,35 +1788,35 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1752 } 1788 }
1753 return err; 1789 return err;
1754 case 1: 1790 case 1:
1755 c->chk_lpt_sz += len; 1791 d->chk_lpt_sz += len;
1756 return 0; 1792 return 0;
1757 case 2: 1793 case 2:
1758 c->chk_lpt_sz += len; 1794 d->chk_lpt_sz += len;
1759 c->chk_lpt_wastage += len; 1795 d->chk_lpt_wastage += len;
1760 c->chk_lpt_lebs += 1; 1796 d->chk_lpt_lebs += 1;
1761 return 0; 1797 return 0;
1762 case 3: 1798 case 3:
1763 chk_lpt_sz = c->leb_size; 1799 chk_lpt_sz = c->leb_size;
1764 chk_lpt_sz *= c->chk_lpt_lebs; 1800 chk_lpt_sz *= d->chk_lpt_lebs;
1765 chk_lpt_sz += len - c->nhead_offs; 1801 chk_lpt_sz += len - c->nhead_offs;
1766 if (c->chk_lpt_sz != chk_lpt_sz) { 1802 if (d->chk_lpt_sz != chk_lpt_sz) {
1767 dbg_err("LPT wrote %lld but space used was %lld", 1803 dbg_err("LPT wrote %lld but space used was %lld",
1768 c->chk_lpt_sz, chk_lpt_sz); 1804 d->chk_lpt_sz, chk_lpt_sz);
1769 err = -EINVAL; 1805 err = -EINVAL;
1770 } 1806 }
1771 if (c->chk_lpt_sz > c->lpt_sz) { 1807 if (d->chk_lpt_sz > c->lpt_sz) {
1772 dbg_err("LPT wrote %lld but lpt_sz is %lld", 1808 dbg_err("LPT wrote %lld but lpt_sz is %lld",
1773 c->chk_lpt_sz, c->lpt_sz); 1809 d->chk_lpt_sz, c->lpt_sz);
1774 err = -EINVAL; 1810 err = -EINVAL;
1775 } 1811 }
1776 if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) { 1812 if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
1777 dbg_err("LPT layout size %lld but wrote %lld", 1813 dbg_err("LPT layout size %lld but wrote %lld",
1778 c->chk_lpt_sz, c->chk_lpt_sz2); 1814 d->chk_lpt_sz, d->chk_lpt_sz2);
1779 err = -EINVAL; 1815 err = -EINVAL;
1780 } 1816 }
1781 if (c->chk_lpt_sz2 && c->new_nhead_offs != len) { 1817 if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
1782 dbg_err("LPT new nhead offs: expected %d was %d", 1818 dbg_err("LPT new nhead offs: expected %d was %d",
1783 c->new_nhead_offs, len); 1819 d->new_nhead_offs, len);
1784 err = -EINVAL; 1820 err = -EINVAL;
1785 } 1821 }
1786 lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; 1822 lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
@@ -1788,26 +1824,146 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1788 lpt_sz += c->ltab_sz; 1824 lpt_sz += c->ltab_sz;
1789 if (c->big_lpt) 1825 if (c->big_lpt)
1790 lpt_sz += c->lsave_sz; 1826 lpt_sz += c->lsave_sz;
1791 if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) { 1827 if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
1792 dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", 1828 dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
1793 c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz); 1829 d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
1794 err = -EINVAL; 1830 err = -EINVAL;
1795 } 1831 }
1796 if (err) 1832 if (err) {
1797 dbg_dump_lpt_info(c); 1833 dbg_dump_lpt_info(c);
1798 c->chk_lpt_sz2 = c->chk_lpt_sz; 1834 dbg_dump_lpt_lebs(c);
1799 c->chk_lpt_sz = 0; 1835 dump_stack();
1800 c->chk_lpt_wastage = 0; 1836 }
1801 c->chk_lpt_lebs = 0; 1837 d->chk_lpt_sz2 = d->chk_lpt_sz;
1802 c->new_nhead_offs = len; 1838 d->chk_lpt_sz = 0;
1839 d->chk_lpt_wastage = 0;
1840 d->chk_lpt_lebs = 0;
1841 d->new_nhead_offs = len;
1803 return err; 1842 return err;
1804 case 4: 1843 case 4:
1805 c->chk_lpt_sz += len; 1844 d->chk_lpt_sz += len;
1806 c->chk_lpt_wastage += len; 1845 d->chk_lpt_wastage += len;
1807 return 0; 1846 return 0;
1808 default: 1847 default:
1809 return -EINVAL; 1848 return -EINVAL;
1810 } 1849 }
1811} 1850}
1812 1851
1852/**
1853 * dbg_dump_lpt_leb - dump an LPT LEB.
1854 * @c: UBIFS file-system description object
1855 * @lnum: LEB number to dump
1856 *
1857 * This function dumps an LEB from LPT area. Nodes in this area are very
1858 * different to nodes in the main area (e.g., they do not have common headers,
1859 * they do not have 8-byte alignments, etc), so we have a separate function to
1860 * dump LPT area LEBs. Note, LPT has to be locked by the caller.
1861 */
1862static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1863{
1864 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1865 void *buf = c->dbg->buf;
1866
1867 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
1868 current->pid, lnum);
1869 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1870 if (err) {
1871 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1872 return;
1873 }
1874 while (1) {
1875 offs = c->leb_size - len;
1876 if (!is_a_node(c, buf, len)) {
1877 int pad_len;
1878
1879 pad_len = get_pad_len(c, buf, len);
1880 if (pad_len) {
1881 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
1882 lnum, offs, pad_len);
1883 buf += pad_len;
1884 len -= pad_len;
1885 continue;
1886 }
1887 if (len)
1888 printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n",
1889 lnum, offs, len);
1890 break;
1891 }
1892
1893 node_type = get_lpt_node_type(c, buf, &node_num);
1894 switch (node_type) {
1895 case UBIFS_LPT_PNODE:
1896 {
1897 node_len = c->pnode_sz;
1898 if (c->big_lpt)
1899 printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n",
1900 lnum, offs, node_num);
1901 else
1902 printk(KERN_DEBUG "LEB %d:%d, pnode\n",
1903 lnum, offs);
1904 break;
1905 }
1906 case UBIFS_LPT_NNODE:
1907 {
1908 int i;
1909 struct ubifs_nnode nnode;
1910
1911 node_len = c->nnode_sz;
1912 if (c->big_lpt)
1913 printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ",
1914 lnum, offs, node_num);
1915 else
1916 printk(KERN_DEBUG "LEB %d:%d, nnode, ",
1917 lnum, offs);
1918 err = ubifs_unpack_nnode(c, buf, &nnode);
1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1920 printk("%d:%d", nnode.nbranch[i].lnum,
1921 nnode.nbranch[i].offs);
1922 if (i != UBIFS_LPT_FANOUT - 1)
1923 printk(", ");
1924 }
1925 printk("\n");
1926 break;
1927 }
1928 case UBIFS_LPT_LTAB:
1929 node_len = c->ltab_sz;
1930 printk(KERN_DEBUG "LEB %d:%d, ltab\n",
1931 lnum, offs);
1932 break;
1933 case UBIFS_LPT_LSAVE:
1934 node_len = c->lsave_sz;
1935 printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs);
1936 break;
1937 default:
1938 ubifs_err("LPT node type %d not recognized", node_type);
1939 return;
1940 }
1941
1942 buf += node_len;
1943 len -= node_len;
1944 }
1945
1946 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
1947 current->pid, lnum);
1948}
1949
1950/**
1951 * dbg_dump_lpt_lebs - dump LPT lebs.
1952 * @c: UBIFS file-system description object
1953 *
1954 * This function dumps all LPT LEBs. The caller has to make sure the LPT is
1955 * locked.
1956 */
1957void dbg_dump_lpt_lebs(const struct ubifs_info *c)
1958{
1959 int i;
1960
1961 printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n",
1962 current->pid);
1963 for (i = 0; i < c->lpt_lebs; i++)
1964 dump_lpt_leb(c, i + c->lpt_first);
1965 printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n",
1966 current->pid);
1967}
1968
1813#endif /* CONFIG_UBIFS_FS_DEBUG */ 1969#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 71d5493bf565..a88f33801b98 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -354,7 +354,7 @@ int ubifs_write_master(struct ubifs_info *c)
354 int err, lnum, offs, len; 354 int err, lnum, offs, len;
355 355
356 if (c->ro_media) 356 if (c->ro_media)
357 return -EINVAL; 357 return -EROFS;
358 358
359 lnum = UBIFS_MST_LNUM; 359 lnum = UBIFS_MST_LNUM;
360 offs = c->mst_offs + c->mst_node_alsz; 360 offs = c->mst_offs + c->mst_node_alsz;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9bd5a43d4526..152a7b34a141 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -46,7 +46,7 @@
46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to 46 * Orphans are accumulated in a rb-tree. When an inode's link count drops to
47 * zero, the inode number is added to the rb-tree. It is removed from the tree 47 * zero, the inode number is added to the rb-tree. It is removed from the tree
48 * when the inode is deleted. Any new orphans that are in the orphan tree when 48 * when the inode is deleted. Any new orphans that are in the orphan tree when
49 * the commit is run, are written to the orphan area in 1 or more orph nodes. 49 * the commit is run, are written to the orphan area in 1 or more orphan nodes.
50 * If the orphan area is full, it is consolidated to make space. There is 50 * If the orphan area is full, it is consolidated to make space. There is
51 * always enough space because validation prevents the user from creating more 51 * always enough space because validation prevents the user from creating more
52 * than the maximum number of orphans allowed. 52 * than the maximum number of orphans allowed.
@@ -231,7 +231,7 @@ static int tot_avail_orphs(struct ubifs_info *c)
231} 231}
232 232
233/** 233/**
234 * do_write_orph_node - write a node 234 * do_write_orph_node - write a node to the orphan head.
235 * @c: UBIFS file-system description object 235 * @c: UBIFS file-system description object
236 * @len: length of node 236 * @len: length of node
237 * @atomic: write atomically 237 * @atomic: write atomically
@@ -264,11 +264,11 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
264} 264}
265 265
266/** 266/**
267 * write_orph_node - write an orph node 267 * write_orph_node - write an orphan node.
268 * @c: UBIFS file-system description object 268 * @c: UBIFS file-system description object
269 * @atomic: write atomically 269 * @atomic: write atomically
270 * 270 *
271 * This function builds an orph node from the cnext list and writes it to the 271 * This function builds an orphan node from the cnext list and writes it to the
272 * orphan head. On success, %0 is returned, otherwise a negative error code 272 * orphan head. On success, %0 is returned, otherwise a negative error code
273 * is returned. 273 * is returned.
274 */ 274 */
@@ -326,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
326} 326}
327 327
328/** 328/**
329 * write_orph_nodes - write orph nodes until there are no more to commit 329 * write_orph_nodes - write orphan nodes until there are no more to commit.
330 * @c: UBIFS file-system description object 330 * @c: UBIFS file-system description object
331 * @atomic: write atomically 331 * @atomic: write atomically
332 * 332 *
333 * This function writes orph nodes for all the orphans to commit. On success, 333 * This function writes orphan nodes for all the orphans to commit. On success,
334 * %0 is returned, otherwise a negative error code is returned. 334 * %0 is returned, otherwise a negative error code is returned.
335 */ 335 */
336static int write_orph_nodes(struct ubifs_info *c, int atomic) 336static int write_orph_nodes(struct ubifs_info *c, int atomic)
@@ -478,14 +478,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c)
478} 478}
479 479
480/** 480/**
481 * clear_orphans - erase all LEBs used for orphans. 481 * ubifs_clear_orphans - erase all LEBs used for orphans.
482 * @c: UBIFS file-system description object 482 * @c: UBIFS file-system description object
483 * 483 *
484 * If recovery is not required, then the orphans from the previous session 484 * If recovery is not required, then the orphans from the previous session
485 * are not needed. This function locates the LEBs used to record 485 * are not needed. This function locates the LEBs used to record
486 * orphans, and un-maps them. 486 * orphans, and un-maps them.
487 */ 487 */
488static int clear_orphans(struct ubifs_info *c) 488int ubifs_clear_orphans(struct ubifs_info *c)
489{ 489{
490 int lnum, err; 490 int lnum, err;
491 491
@@ -547,9 +547,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
547 * do_kill_orphans - remove orphan inodes from the index. 547 * do_kill_orphans - remove orphan inodes from the index.
548 * @c: UBIFS file-system description object 548 * @c: UBIFS file-system description object
549 * @sleb: scanned LEB 549 * @sleb: scanned LEB
550 * @last_cmt_no: cmt_no of last orph node read is passed and returned here 550 * @last_cmt_no: cmt_no of last orphan node read is passed and returned here
551 * @outofdate: whether the LEB is out of date is returned here 551 * @outofdate: whether the LEB is out of date is returned here
552 * @last_flagged: whether the end orph node is encountered 552 * @last_flagged: whether the end orphan node is encountered
553 * 553 *
554 * This function is a helper to the 'kill_orphans()' function. It goes through 554 * This function is a helper to the 'kill_orphans()' function. It goes through
555 * every orphan node in a LEB and for every inode number recorded, removes 555 * every orphan node in a LEB and for every inode number recorded, removes
@@ -580,8 +580,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
580 /* 580 /*
581 * The commit number on the master node may be less, because 581 * The commit number on the master node may be less, because
582 * of a failed commit. If there are several failed commits in a 582 * of a failed commit. If there are several failed commits in a
583 * row, the commit number written on orph nodes will continue to 583 * row, the commit number written on orphan nodes will continue
584 * increase (because the commit number is adjusted here) even 584 * to increase (because the commit number is adjusted here) even
585 * though the commit number on the master node stays the same 585 * though the commit number on the master node stays the same
586 * because the master node has not been re-written. 586 * because the master node has not been re-written.
587 */ 587 */
@@ -589,9 +589,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
589 c->cmt_no = cmt_no; 589 c->cmt_no = cmt_no;
590 if (cmt_no < *last_cmt_no && *last_flagged) { 590 if (cmt_no < *last_cmt_no && *last_flagged) {
591 /* 591 /*
592 * The last orph node had a higher commit number and was 592 * The last orphan node had a higher commit number and
593 * flagged as the last written for that commit number. 593 * was flagged as the last written for that commit
594 * That makes this orph node, out of date. 594 * number. That makes this orphan node, out of date.
595 */ 595 */
596 if (!first) { 596 if (!first) {
597 ubifs_err("out of order commit number %llu in " 597 ubifs_err("out of order commit number %llu in "
@@ -658,10 +658,10 @@ static int kill_orphans(struct ubifs_info *c)
658 /* 658 /*
659 * Orph nodes always start at c->orph_first and are written to each 659 * Orph nodes always start at c->orph_first and are written to each
660 * successive LEB in turn. Generally unused LEBs will have been unmapped 660 * successive LEB in turn. Generally unused LEBs will have been unmapped
661 * but may contain out of date orph nodes if the unmap didn't go 661 * but may contain out of date orphan nodes if the unmap didn't go
662 * through. In addition, the last orph node written for each commit is 662 * through. In addition, the last orphan node written for each commit is
663 * marked (top bit of orph->cmt_no is set to 1). It is possible that 663 * marked (top bit of orph->cmt_no is set to 1). It is possible that
664 * there are orph nodes from the next commit (i.e. the commit did not 664 * there are orphan nodes from the next commit (i.e. the commit did not
665 * complete successfully). In that case, no orphans will have been lost 665 * complete successfully). In that case, no orphans will have been lost
666 * due to the way that orphans are written, and any orphans added will 666 * due to the way that orphans are written, and any orphans added will
667 * be valid orphans anyway and so can be deleted. 667 * be valid orphans anyway and so can be deleted.
@@ -718,7 +718,7 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
718 if (unclean) 718 if (unclean)
719 err = kill_orphans(c); 719 err = kill_orphans(c);
720 else if (!read_only) 720 else if (!read_only)
721 err = clear_orphans(c); 721 err = ubifs_clear_orphans(c);
722 722
723 return err; 723 return err;
724} 724}
@@ -899,7 +899,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
900 struct ubifs_scan_leb *sleb; 900 struct ubifs_scan_leb *sleb;
901 901
902 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 902 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
903 if (IS_ERR(sleb)) { 903 if (IS_ERR(sleb)) {
904 err = PTR_ERR(sleb); 904 err = PTR_ERR(sleb);
905 break; 905 break;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 21f7d047c306..ce42a7b0ca5a 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -144,7 +144,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the the journal heads
147 * race with eachother. This is not a problem but is does mean 147 * race with each other. This is not a problem but is does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
150 */ 150 */
@@ -656,7 +656,7 @@ out_dump:
656 * @dirty: amount of dirty space from padding and deletion nodes 656 * @dirty: amount of dirty space from padding and deletion nodes
657 * 657 *
658 * This function inserts a reference node to the replay tree and returns zero 658 * This function inserts a reference node to the replay tree and returns zero
659 * in case of success ort a negative error code in case of failure. 659 * in case of success or a negative error code in case of failure.
660 */ 660 */
661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, 661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
662 unsigned long long sqnum, int free, int dirty) 662 unsigned long long sqnum, int free, int dirty)
@@ -883,7 +883,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
883 * This means that we reached end of log and now 883 * This means that we reached end of log and now
884 * look to the older log data, which was already 884 * look to the older log data, which was already
885 * committed but the eraseblock was not erased (UBIFS 885 * committed but the eraseblock was not erased (UBIFS
886 * only unmaps it). So this basically means we have to 886 * only un-maps it). So this basically means we have to
887 * exit with "end of log" code. 887 * exit with "end of log" code.
888 */ 888 */
889 err = 1; 889 err = 1;
@@ -1062,6 +1062,15 @@ int ubifs_replay_journal(struct ubifs_info *c)
1062 if (err) 1062 if (err)
1063 goto out; 1063 goto out;
1064 1064
1065 /*
1066 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
1067 * to roughly estimate index growth. Things like @c->min_idx_lebs
1068 * depend on it. This means we have to initialize it to make sure
1069 * budgeting works properly.
1070 */
1071 c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
1072 c->budg_uncommitted_idx *= c->max_idx_node_sz;
1073
1065 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); 1074 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1066 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " 1075 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1067 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, 1076 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 0f392351dc5a..e070c643d1bb 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,6 +28,7 @@
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/random.h> 30#include <linux/random.h>
31#include <linux/math64.h>
31 32
32/* 33/*
33 * Default journal size in logical eraseblocks as a percent of total 34 * Default journal size in logical eraseblocks as a percent of total
@@ -80,7 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c)
80 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; 81 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
81 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; 82 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
82 int min_leb_cnt = UBIFS_MIN_LEB_CNT; 83 int min_leb_cnt = UBIFS_MIN_LEB_CNT;
83 uint64_t tmp64, main_bytes; 84 long long tmp64, main_bytes;
84 __le64 tmp_le64; 85 __le64 tmp_le64;
85 86
86 /* Some functions called from here depend on the @c->key_len filed */ 87 /* Some functions called from here depend on the @c->key_len filed */
@@ -160,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
160 if (!sup) 161 if (!sup)
161 return -ENOMEM; 162 return -ENOMEM;
162 163
163 tmp64 = (uint64_t)max_buds * c->leb_size; 164 tmp64 = (long long)max_buds * c->leb_size;
164 if (big_lpt) 165 if (big_lpt)
165 sup_flags |= UBIFS_FLG_BIGLPT; 166 sup_flags |= UBIFS_FLG_BIGLPT;
166 167
@@ -179,14 +180,16 @@ static int create_default_filesystem(struct ubifs_info *c)
179 sup->fanout = cpu_to_le32(DEFAULT_FANOUT); 180 sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
180 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); 181 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
181 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); 182 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
182 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
183 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); 183 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
184 if (c->mount_opts.override_compr)
185 sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
186 else
187 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
184 188
185 generate_random_uuid(sup->uuid); 189 generate_random_uuid(sup->uuid);
186 190
187 main_bytes = (uint64_t)main_lebs * c->leb_size; 191 main_bytes = (long long)main_lebs * c->leb_size;
188 tmp64 = main_bytes * DEFAULT_RP_PERCENT; 192 tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);
189 do_div(tmp64, 100);
190 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
191 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
192 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
@@ -582,16 +585,15 @@ int ubifs_read_superblock(struct ubifs_info *c)
582 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; 585 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
583 c->fanout = le32_to_cpu(sup->fanout); 586 c->fanout = le32_to_cpu(sup->fanout);
584 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); 587 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
585 c->default_compr = le16_to_cpu(sup->default_compr);
586 c->rp_size = le64_to_cpu(sup->rp_size); 588 c->rp_size = le64_to_cpu(sup->rp_size);
587 c->rp_uid = le32_to_cpu(sup->rp_uid); 589 c->rp_uid = le32_to_cpu(sup->rp_uid);
588 c->rp_gid = le32_to_cpu(sup->rp_gid); 590 c->rp_gid = le32_to_cpu(sup->rp_gid);
589 sup_flags = le32_to_cpu(sup->flags); 591 sup_flags = le32_to_cpu(sup->flags);
592 if (!c->mount_opts.override_compr)
593 c->default_compr = le16_to_cpu(sup->default_compr);
590 594
591 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); 595 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
592
593 memcpy(&c->uuid, &sup->uuid, 16); 596 memcpy(&c->uuid, &sup->uuid, 16);
594
595 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); 597 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
596 598
597 /* Automatically increase file system size to the maximum size */ 599 /* Automatically increase file system size to the maximum size */
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f248533841a2..e7bab52a1410 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
151 * @contention: if any contention, this is set to %1 151 * @contention: if any contention, this is set to %1
152 * 152 *
153 * This function walks the list of mounted UBIFS file-systems and frees clean 153 * This function walks the list of mounted UBIFS file-systems and frees clean
154 * znodes which are older then @age, until at least @nr znodes are freed. 154 * znodes which are older than @age, until at least @nr znodes are freed.
155 * Returns the number of freed znodes. 155 * Returns the number of freed znodes.
156 */ 156 */
157static int shrink_tnc_trees(int nr, int age, int *contention) 157static int shrink_tnc_trees(int nr, int age, int *contention)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d80b2aef42b6..1182b66a5491 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -34,6 +34,8 @@
34#include <linux/parser.h> 34#include <linux/parser.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h>
38#include <linux/writeback.h>
37#include "ubifs.h" 39#include "ubifs.h"
38 40
39/* 41/*
@@ -395,6 +397,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
395 buf->f_namelen = UBIFS_MAX_NLEN; 397 buf->f_namelen = UBIFS_MAX_NLEN;
396 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); 398 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
397 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); 399 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
400 ubifs_assert(buf->f_bfree <= c->block_cnt);
398 return 0; 401 return 0;
399} 402}
400 403
@@ -417,39 +420,62 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
417 else if (c->mount_opts.chk_data_crc == 1) 420 else if (c->mount_opts.chk_data_crc == 1)
418 seq_printf(s, ",no_chk_data_crc"); 421 seq_printf(s, ",no_chk_data_crc");
419 422
423 if (c->mount_opts.override_compr) {
424 seq_printf(s, ",compr=");
425 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type));
426 }
427
420 return 0; 428 return 0;
421} 429}
422 430
423static int ubifs_sync_fs(struct super_block *sb, int wait) 431static int ubifs_sync_fs(struct super_block *sb, int wait)
424{ 432{
433 int i, err;
425 struct ubifs_info *c = sb->s_fs_info; 434 struct ubifs_info *c = sb->s_fs_info;
426 int i, ret = 0, err; 435 struct writeback_control wbc = {
427 long long bud_bytes; 436 .sync_mode = WB_SYNC_ALL,
437 .range_start = 0,
438 .range_end = LLONG_MAX,
439 .nr_to_write = LONG_MAX,
440 };
428 441
429 if (c->jheads) { 442 /*
430 for (i = 0; i < c->jhead_cnt; i++) { 443 * Zero @wait is just an advisory thing to help the file system shove
431 err = ubifs_wbuf_sync(&c->jheads[i].wbuf); 444 * lots of data into the queues, and there will be the second
432 if (err && !ret) 445 * '->sync_fs()' call, with non-zero @wait.
433 ret = err; 446 */
434 } 447 if (!wait)
448 return 0;
435 449
436 /* Commit the journal unless it has too little data */ 450 if (sb->s_flags & MS_RDONLY)
437 spin_lock(&c->buds_lock); 451 return 0;
438 bud_bytes = c->bud_bytes; 452
439 spin_unlock(&c->buds_lock); 453 /*
440 if (bud_bytes > c->leb_size) { 454 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
441 err = ubifs_run_commit(c); 455 * pages, so synchronize them first, then commit the journal. Strictly
442 if (err) 456 * speaking, it is not necessary to commit the journal here,
443 return err; 457 * synchronizing write-buffers would be enough. But committing makes
444 } 458 * UBIFS free space predictions much more accurate, so we want to let
445 } 459 * the user be able to get more accurate results of 'statfs()' after
460 * they synchronize the file system.
461 */
462 generic_sync_sb_inodes(sb, &wbc);
446 463
447 /* 464 /*
448 * We ought to call sync for c->ubi but it does not have one. If it had 465 * Synchronize write buffers, because 'ubifs_run_commit()' does not
449 * it would in turn call mtd->sync, however mtd operations are 466 * do this if it waits for an already running commit.
450 * synchronous anyway, so we don't lose any sleep here.
451 */ 467 */
452 return ret; 468 for (i = 0; i < c->jhead_cnt; i++) {
469 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
470 if (err)
471 return err;
472 }
473
474 err = ubifs_run_commit(c);
475 if (err)
476 return err;
477
478 return ubi_sync(c->vi.ubi_num);
453} 479}
454 480
455/** 481/**
@@ -548,15 +574,8 @@ static int init_constants_early(struct ubifs_info *c)
548 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; 574 c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
549 575
550 /* 576 /*
551 * Initialize dead and dark LEB space watermarks. 577 * Initialize dead and dark LEB space watermarks. See gc.c for comments
552 * 578 * about these values.
553 * Dead space is the space which cannot be used. Its watermark is
554 * equivalent to min. I/O unit or minimum node size if it is greater
555 * then min. I/O unit.
556 *
557 * Dark space is the space which might be used, or might not, depending
558 * on which node should be written to the LEB. Its watermark is
559 * equivalent to maximum UBIFS node size.
560 */ 579 */
561 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); 580 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
562 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); 581 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
@@ -596,7 +615,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
596} 615}
597 616
598/* 617/*
599 * init_constants_late - initialize UBIFS constants. 618 * init_constants_sb - initialize UBIFS constants.
600 * @c: UBIFS file-system description object 619 * @c: UBIFS file-system description object
601 * 620 *
602 * This is a helper function which initializes various UBIFS constants after 621 * This is a helper function which initializes various UBIFS constants after
@@ -604,10 +623,10 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
604 * makes sure they are all right. Returns zero in case of success and a 623 * makes sure they are all right. Returns zero in case of success and a
605 * negative error code in case of failure. 624 * negative error code in case of failure.
606 */ 625 */
607static int init_constants_late(struct ubifs_info *c) 626static int init_constants_sb(struct ubifs_info *c)
608{ 627{
609 int tmp, err; 628 int tmp, err;
610 uint64_t tmp64; 629 long long tmp64;
611 630
612 c->main_bytes = (long long)c->main_lebs * c->leb_size; 631 c->main_bytes = (long long)c->main_lebs * c->leb_size;
613 c->max_znode_sz = sizeof(struct ubifs_znode) + 632 c->max_znode_sz = sizeof(struct ubifs_znode) +
@@ -634,9 +653,8 @@ static int init_constants_late(struct ubifs_info *c)
634 * Make sure that the log is large enough to fit reference nodes for 653 * Make sure that the log is large enough to fit reference nodes for
635 * all buds plus one reserved LEB. 654 * all buds plus one reserved LEB.
636 */ 655 */
637 tmp64 = c->max_bud_bytes; 656 tmp64 = c->max_bud_bytes + c->leb_size - 1;
638 tmp = do_div(tmp64, c->leb_size); 657 c->max_bud_cnt = div_u64(tmp64, c->leb_size);
639 c->max_bud_cnt = tmp64 + !!tmp;
640 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); 658 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
641 tmp /= c->leb_size; 659 tmp /= c->leb_size;
642 tmp += 1; 660 tmp += 1;
@@ -672,7 +690,7 @@ static int init_constants_late(struct ubifs_info *c)
672 * Consequently, if the journal is too small, UBIFS will treat it as 690 * Consequently, if the journal is too small, UBIFS will treat it as
673 * always full. 691 * always full.
674 */ 692 */
675 tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1; 693 tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1;
676 if (c->bg_bud_bytes < tmp64) 694 if (c->bg_bud_bytes < tmp64)
677 c->bg_bud_bytes = tmp64; 695 c->bg_bud_bytes = tmp64;
678 if (c->max_bud_bytes < tmp64 + c->leb_size) 696 if (c->max_bud_bytes < tmp64 + c->leb_size)
@@ -682,6 +700,21 @@ static int init_constants_late(struct ubifs_info *c)
682 if (err) 700 if (err)
683 return err; 701 return err;
684 702
703 return 0;
704}
705
706/*
707 * init_constants_master - initialize UBIFS constants.
708 * @c: UBIFS file-system description object
709 *
710 * This is a helper function which initializes various UBIFS constants after
711 * the master node has been read. It also checks various UBIFS parameters and
712 * makes sure they are all right.
713 */
714static void init_constants_master(struct ubifs_info *c)
715{
716 long long tmp64;
717
685 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
686 719
687 /* 720 /*
@@ -690,26 +723,25 @@ static int init_constants_late(struct ubifs_info *c)
690 * necessary to report something for the 'statfs()' call. 723 * necessary to report something for the 'statfs()' call.
691 * 724 *
692 * Subtract the LEB reserved for GC, the LEB which is reserved for 725 * Subtract the LEB reserved for GC, the LEB which is reserved for
693 * deletions, and assume only one journal head is available. 726 * deletions, minimum LEBs for the index, and assume only one journal
727 * head is available.
694 */ 728 */
695 tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; 729 tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1;
696 tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; 730 tmp64 *= (long long)c->leb_size - c->leb_overhead;
697 tmp64 = ubifs_reported_space(c, tmp64); 731 tmp64 = ubifs_reported_space(c, tmp64);
698 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; 732 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
699
700 return 0;
701} 733}
702 734
703/** 735/**
704 * take_gc_lnum - reserve GC LEB. 736 * take_gc_lnum - reserve GC LEB.
705 * @c: UBIFS file-system description object 737 * @c: UBIFS file-system description object
706 * 738 *
707 * This function ensures that the LEB reserved for garbage collection is 739 * This function ensures that the LEB reserved for garbage collection is marked
708 * unmapped and is marked as "taken" in lprops. We also have to set free space 740 * as "taken" in lprops. We also have to set free space to LEB size and dirty
709 * to LEB size and dirty space to zero, because lprops may contain out-of-date 741 * space to zero, because lprops may contain out-of-date information if the
710 * information if the file-system was un-mounted before it has been committed. 742 * file-system was un-mounted before it has been committed. This function
711 * This function returns zero in case of success and a negative error code in 743 * returns zero in case of success and a negative error code in case of
712 * case of failure. 744 * failure.
713 */ 745 */
714static int take_gc_lnum(struct ubifs_info *c) 746static int take_gc_lnum(struct ubifs_info *c)
715{ 747{
@@ -720,10 +752,6 @@ static int take_gc_lnum(struct ubifs_info *c)
720 return -EINVAL; 752 return -EINVAL;
721 } 753 }
722 754
723 err = ubifs_leb_unmap(c, c->gc_lnum);
724 if (err)
725 return err;
726
727 /* And we have to tell lprops that this LEB is taken */ 755 /* And we have to tell lprops that this LEB is taken */
728 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, 756 err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
729 LPROPS_TAKEN, 0, 0); 757 LPROPS_TAKEN, 0, 0);
@@ -878,6 +906,7 @@ static int check_volume_empty(struct ubifs_info *c)
878 * Opt_no_bulk_read: disable bulk-reads 906 * Opt_no_bulk_read: disable bulk-reads
879 * Opt_chk_data_crc: check CRCs when reading data nodes 907 * Opt_chk_data_crc: check CRCs when reading data nodes
880 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes 908 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
909 * Opt_override_compr: override default compressor
881 * Opt_err: just end of array marker 910 * Opt_err: just end of array marker
882 */ 911 */
883enum { 912enum {
@@ -887,6 +916,7 @@ enum {
887 Opt_no_bulk_read, 916 Opt_no_bulk_read,
888 Opt_chk_data_crc, 917 Opt_chk_data_crc,
889 Opt_no_chk_data_crc, 918 Opt_no_chk_data_crc,
919 Opt_override_compr,
890 Opt_err, 920 Opt_err,
891}; 921};
892 922
@@ -897,6 +927,7 @@ static const match_table_t tokens = {
897 {Opt_no_bulk_read, "no_bulk_read"}, 927 {Opt_no_bulk_read, "no_bulk_read"},
898 {Opt_chk_data_crc, "chk_data_crc"}, 928 {Opt_chk_data_crc, "chk_data_crc"},
899 {Opt_no_chk_data_crc, "no_chk_data_crc"}, 929 {Opt_no_chk_data_crc, "no_chk_data_crc"},
930 {Opt_override_compr, "compr=%s"},
900 {Opt_err, NULL}, 931 {Opt_err, NULL},
901}; 932};
902 933
@@ -926,13 +957,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
926 957
927 token = match_token(p, tokens, args); 958 token = match_token(p, tokens, args);
928 switch (token) { 959 switch (token) {
960 /*
961 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
962 * We accept them in order to be backward-compatible. But this
963 * should be removed at some point.
964 */
929 case Opt_fast_unmount: 965 case Opt_fast_unmount:
930 c->mount_opts.unmount_mode = 2; 966 c->mount_opts.unmount_mode = 2;
931 c->fast_unmount = 1;
932 break; 967 break;
933 case Opt_norm_unmount: 968 case Opt_norm_unmount:
934 c->mount_opts.unmount_mode = 1; 969 c->mount_opts.unmount_mode = 1;
935 c->fast_unmount = 0;
936 break; 970 break;
937 case Opt_bulk_read: 971 case Opt_bulk_read:
938 c->mount_opts.bulk_read = 2; 972 c->mount_opts.bulk_read = 2;
@@ -950,6 +984,28 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
950 c->mount_opts.chk_data_crc = 1; 984 c->mount_opts.chk_data_crc = 1;
951 c->no_chk_data_crc = 1; 985 c->no_chk_data_crc = 1;
952 break; 986 break;
987 case Opt_override_compr:
988 {
989 char *name = match_strdup(&args[0]);
990
991 if (!name)
992 return -ENOMEM;
993 if (!strcmp(name, "none"))
994 c->mount_opts.compr_type = UBIFS_COMPR_NONE;
995 else if (!strcmp(name, "lzo"))
996 c->mount_opts.compr_type = UBIFS_COMPR_LZO;
997 else if (!strcmp(name, "zlib"))
998 c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
999 else {
1000 ubifs_err("unknown compressor \"%s\"", name);
1001 kfree(name);
1002 return -EINVAL;
1003 }
1004 kfree(name);
1005 c->mount_opts.override_compr = 1;
1006 c->default_compr = c->mount_opts.compr_type;
1007 break;
1008 }
953 default: 1009 default:
954 ubifs_err("unrecognized mount option \"%s\" " 1010 ubifs_err("unrecognized mount option \"%s\" "
955 "or missing value", p); 1011 "or missing value", p);
@@ -1019,6 +1075,25 @@ again:
1019} 1075}
1020 1076
1021/** 1077/**
1078 * check_free_space - check if there is enough free space to mount.
1079 * @c: UBIFS file-system description object
1080 *
1081 * This function makes sure UBIFS has enough free space to be mounted in
1082 * read/write mode. UBIFS must always have some free space to allow deletions.
1083 */
1084static int check_free_space(struct ubifs_info *c)
1085{
1086 ubifs_assert(c->dark_wm > 0);
1087 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
1088 ubifs_err("insufficient free space to mount in read/write mode");
1089 dbg_dump_budg(c);
1090 dbg_dump_lprops(c);
1091 return -ENOSPC;
1092 }
1093 return 0;
1094}
1095
1096/**
1022 * mount_ubifs - mount UBIFS file-system. 1097 * mount_ubifs - mount UBIFS file-system.
1023 * @c: UBIFS file-system description object 1098 * @c: UBIFS file-system description object
1024 * 1099 *
@@ -1039,11 +1114,9 @@ static int mount_ubifs(struct ubifs_info *c)
1039 if (err) 1114 if (err)
1040 return err; 1115 return err;
1041 1116
1042#ifdef CONFIG_UBIFS_FS_DEBUG 1117 err = ubifs_debugging_init(c);
1043 c->dbg_buf = vmalloc(c->leb_size); 1118 if (err)
1044 if (!c->dbg_buf) 1119 return err;
1045 return -ENOMEM;
1046#endif
1047 1120
1048 err = check_volume_empty(c); 1121 err = check_volume_empty(c);
1049 if (err) 1122 if (err)
@@ -1100,27 +1173,25 @@ static int mount_ubifs(struct ubifs_info *c)
1100 goto out_free; 1173 goto out_free;
1101 1174
1102 /* 1175 /*
1103 * Make sure the compressor which is set as the default on in the 1176 * Make sure the compressor which is set as default in the superblock
1104 * superblock was actually compiled in. 1177 * or overridden by mount options is actually compiled in.
1105 */ 1178 */
1106 if (!ubifs_compr_present(c->default_compr)) { 1179 if (!ubifs_compr_present(c->default_compr)) {
1107 ubifs_warn("'%s' compressor is set by superblock, but not " 1180 ubifs_err("'compressor \"%s\" is not compiled in",
1108 "compiled in", ubifs_compr_name(c->default_compr)); 1181 ubifs_compr_name(c->default_compr));
1109 c->default_compr = UBIFS_COMPR_NONE; 1182 goto out_free;
1110 } 1183 }
1111 1184
1112 dbg_failure_mode_registration(c); 1185 err = init_constants_sb(c);
1113
1114 err = init_constants_late(c);
1115 if (err) 1186 if (err)
1116 goto out_dereg; 1187 goto out_free;
1117 1188
1118 sz = ALIGN(c->max_idx_node_sz, c->min_io_size); 1189 sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
1119 sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); 1190 sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
1120 c->cbuf = kmalloc(sz, GFP_NOFS); 1191 c->cbuf = kmalloc(sz, GFP_NOFS);
1121 if (!c->cbuf) { 1192 if (!c->cbuf) {
1122 err = -ENOMEM; 1193 err = -ENOMEM;
1123 goto out_dereg; 1194 goto out_free;
1124 } 1195 }
1125 1196
1126 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); 1197 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
@@ -1145,6 +1216,8 @@ static int mount_ubifs(struct ubifs_info *c)
1145 if (err) 1216 if (err)
1146 goto out_master; 1217 goto out_master;
1147 1218
1219 init_constants_master(c);
1220
1148 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1221 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1149 ubifs_msg("recovery needed"); 1222 ubifs_msg("recovery needed");
1150 c->need_recovery = 1; 1223 c->need_recovery = 1;
@@ -1183,12 +1256,9 @@ static int mount_ubifs(struct ubifs_info *c)
1183 if (!mounted_read_only) { 1256 if (!mounted_read_only) {
1184 int lnum; 1257 int lnum;
1185 1258
1186 /* Check for enough free space */ 1259 err = check_free_space(c);
1187 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { 1260 if (err)
1188 ubifs_err("insufficient available space");
1189 err = -EINVAL;
1190 goto out_orphans; 1261 goto out_orphans;
1191 }
1192 1262
1193 /* Check for enough log space */ 1263 /* Check for enough log space */
1194 lnum = c->lhead_lnum + 1; 1264 lnum = c->lhead_lnum + 1;
@@ -1205,10 +1275,19 @@ static int mount_ubifs(struct ubifs_info *c)
1205 if (err) 1275 if (err)
1206 goto out_orphans; 1276 goto out_orphans;
1207 err = ubifs_rcvry_gc_commit(c); 1277 err = ubifs_rcvry_gc_commit(c);
1208 } else 1278 } else {
1209 err = take_gc_lnum(c); 1279 err = take_gc_lnum(c);
1210 if (err) 1280 if (err)
1211 goto out_orphans; 1281 goto out_orphans;
1282
1283 /*
1284 * GC LEB may contain garbage if there was an unclean
1285 * reboot, and it should be un-mapped.
1286 */
1287 err = ubifs_leb_unmap(c, c->gc_lnum);
1288 if (err)
1289 return err;
1290 }
1212 1291
1213 err = dbg_check_lprops(c); 1292 err = dbg_check_lprops(c);
1214 if (err) 1293 if (err)
@@ -1217,6 +1296,16 @@ static int mount_ubifs(struct ubifs_info *c)
1217 err = ubifs_recover_size(c); 1296 err = ubifs_recover_size(c);
1218 if (err) 1297 if (err)
1219 goto out_orphans; 1298 goto out_orphans;
1299 } else {
1300 /*
1301 * Even if we mount read-only, we have to set space in GC LEB
1302 * to proper value because this affects UBIFS free space
1303 * reporting. We do not want to have a situation when
1304 * re-mounting from R/O to R/W changes amount of free space.
1305 */
1306 err = take_gc_lnum(c);
1307 if (err)
1308 goto out_orphans;
1220 } 1309 }
1221 1310
1222 spin_lock(&ubifs_infos_lock); 1311 spin_lock(&ubifs_infos_lock);
@@ -1229,13 +1318,20 @@ static int mount_ubifs(struct ubifs_info *c)
1229 else { 1318 else {
1230 c->need_recovery = 0; 1319 c->need_recovery = 0;
1231 ubifs_msg("recovery completed"); 1320 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */
1322 ubifs_assert(c->lst.taken_empty_lebs == 1);
1232 } 1323 }
1233 } 1324 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1);
1234 1326
1235 err = dbg_check_filesystem(c); 1327 err = dbg_check_filesystem(c);
1236 if (err) 1328 if (err)
1237 goto out_infos; 1329 goto out_infos;
1238 1330
1331 err = dbg_debugfs_init_fs(c);
1332 if (err)
1333 goto out_infos;
1334
1239 c->always_chk_crc = 0; 1335 c->always_chk_crc = 0;
1240 1336
1241 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1337 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
@@ -1266,7 +1362,6 @@ static int mount_ubifs(struct ubifs_info *c)
1266 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], 1362 c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
1267 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], 1363 c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
1268 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); 1364 c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
1269 dbg_msg("fast unmount: %d", c->fast_unmount);
1270 dbg_msg("big_lpt %d", c->big_lpt); 1365 dbg_msg("big_lpt %d", c->big_lpt);
1271 dbg_msg("log LEBs: %d (%d - %d)", 1366 dbg_msg("log LEBs: %d (%d - %d)",
1272 c->log_lebs, UBIFS_LOG_LNUM, c->log_last); 1367 c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
@@ -1283,8 +1378,20 @@ static int mount_ubifs(struct ubifs_info *c)
1283 dbg_msg("tree fanout: %d", c->fanout); 1378 dbg_msg("tree fanout: %d", c->fanout);
1284 dbg_msg("reserved GC LEB: %d", c->gc_lnum); 1379 dbg_msg("reserved GC LEB: %d", c->gc_lnum);
1285 dbg_msg("first main LEB: %d", c->main_first); 1380 dbg_msg("first main LEB: %d", c->main_first);
1381 dbg_msg("max. znode size %d", c->max_znode_sz);
1382 dbg_msg("max. index node size %d", c->max_idx_node_sz);
1383 dbg_msg("node sizes: data %zu, inode %zu, dentry %zu",
1384 UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
1385 dbg_msg("node sizes: trun %zu, sb %zu, master %zu",
1386 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
1387 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1388 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1389 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu",
1390 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1391 UBIFS_MAX_DENT_NODE_SZ);
1286 dbg_msg("dead watermark: %d", c->dead_wm); 1392 dbg_msg("dead watermark: %d", c->dead_wm);
1287 dbg_msg("dark watermark: %d", c->dark_wm); 1393 dbg_msg("dark watermark: %d", c->dark_wm);
1394 dbg_msg("LEB overhead: %d", c->leb_overhead);
1288 x = (long long)c->main_lebs * c->dark_wm; 1395 x = (long long)c->main_lebs * c->dark_wm;
1289 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", 1396 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
1290 x, x >> 10, x >> 20); 1397 x, x >> 10, x >> 20);
@@ -1320,14 +1427,12 @@ out_wbufs:
1320 free_wbufs(c); 1427 free_wbufs(c);
1321out_cbuf: 1428out_cbuf:
1322 kfree(c->cbuf); 1429 kfree(c->cbuf);
1323out_dereg:
1324 dbg_failure_mode_deregistration(c);
1325out_free: 1430out_free:
1326 kfree(c->bu.buf); 1431 kfree(c->bu.buf);
1327 vfree(c->ileb_buf); 1432 vfree(c->ileb_buf);
1328 vfree(c->sbuf); 1433 vfree(c->sbuf);
1329 kfree(c->bottom_up_buf); 1434 kfree(c->bottom_up_buf);
1330 UBIFS_DBG(vfree(c->dbg_buf)); 1435 ubifs_debugging_exit(c);
1331 return err; 1436 return err;
1332} 1437}
1333 1438
@@ -1345,6 +1450,7 @@ static void ubifs_umount(struct ubifs_info *c)
1345 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, 1450 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
1346 c->vi.vol_id); 1451 c->vi.vol_id);
1347 1452
1453 dbg_debugfs_exit_fs(c);
1348 spin_lock(&ubifs_infos_lock); 1454 spin_lock(&ubifs_infos_lock);
1349 list_del(&c->infos_list); 1455 list_del(&c->infos_list);
1350 spin_unlock(&ubifs_infos_lock); 1456 spin_unlock(&ubifs_infos_lock);
@@ -1364,8 +1470,7 @@ static void ubifs_umount(struct ubifs_info *c)
1364 vfree(c->ileb_buf); 1470 vfree(c->ileb_buf);
1365 vfree(c->sbuf); 1471 vfree(c->sbuf);
1366 kfree(c->bottom_up_buf); 1472 kfree(c->bottom_up_buf);
1367 UBIFS_DBG(vfree(c->dbg_buf)); 1473 ubifs_debugging_exit(c);
1368 dbg_failure_mode_deregistration(c);
1369} 1474}
1370 1475
1371/** 1476/**
@@ -1380,19 +1485,14 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1380{ 1485{
1381 int err, lnum; 1486 int err, lnum;
1382 1487
1383 if (c->ro_media)
1384 return -EINVAL;
1385
1386 mutex_lock(&c->umount_mutex); 1488 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c);
1387 c->remounting_rw = 1; 1490 c->remounting_rw = 1;
1388 c->always_chk_crc = 1; 1491 c->always_chk_crc = 1;
1389 1492
1390 /* Check for enough free space */ 1493 err = check_free_space(c);
1391 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { 1494 if (err)
1392 ubifs_err("insufficient available space");
1393 err = -EINVAL;
1394 goto out; 1495 goto out;
1395 }
1396 1496
1397 if (c->old_leb_cnt != c->leb_cnt) { 1497 if (c->old_leb_cnt != c->leb_cnt) {
1398 struct ubifs_sb_node *sup; 1498 struct ubifs_sb_node *sup;
@@ -1422,6 +1522,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1422 err = ubifs_recover_inl_heads(c, c->sbuf); 1522 err = ubifs_recover_inl_heads(c, c->sbuf);
1423 if (err) 1523 if (err)
1424 goto out; 1524 goto out;
1525 } else {
1526 /* A readonly mount is not allowed to have orphans */
1527 ubifs_assert(c->tot_orphans == 0);
1528 err = ubifs_clear_orphans(c);
1529 if (err)
1530 goto out;
1425 } 1531 }
1426 1532
1427 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { 1533 if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
@@ -1477,7 +1583,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1477 if (c->need_recovery) 1583 if (c->need_recovery)
1478 err = ubifs_rcvry_gc_commit(c); 1584 err = ubifs_rcvry_gc_commit(c);
1479 else 1585 else
1480 err = take_gc_lnum(c); 1586 err = ubifs_leb_unmap(c, c->gc_lnum);
1481 if (err) 1587 if (err)
1482 goto out; 1588 goto out;
1483 1589
@@ -1490,8 +1596,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1490 c->vfs_sb->s_flags &= ~MS_RDONLY; 1596 c->vfs_sb->s_flags &= ~MS_RDONLY;
1491 c->remounting_rw = 0; 1597 c->remounting_rw = 0;
1492 c->always_chk_crc = 0; 1598 c->always_chk_crc = 0;
1599 err = dbg_check_space_info(c);
1493 mutex_unlock(&c->umount_mutex); 1600 mutex_unlock(&c->umount_mutex);
1494 return 0; 1601 return err;
1495 1602
1496out: 1603out:
1497 vfree(c->orph_buf); 1604 vfree(c->orph_buf);
@@ -1511,39 +1618,18 @@ out:
1511} 1618}
1512 1619
1513/** 1620/**
1514 * commit_on_unmount - commit the journal when un-mounting.
1515 * @c: UBIFS file-system description object
1516 *
1517 * This function is called during un-mounting and re-mounting, and it commits
1518 * the journal unless the "fast unmount" mode is enabled. It also avoids
1519 * committing the journal if it contains too few data.
1520 */
1521static void commit_on_unmount(struct ubifs_info *c)
1522{
1523 if (!c->fast_unmount) {
1524 long long bud_bytes;
1525
1526 spin_lock(&c->buds_lock);
1527 bud_bytes = c->bud_bytes;
1528 spin_unlock(&c->buds_lock);
1529 if (bud_bytes > c->leb_size)
1530 ubifs_run_commit(c);
1531 }
1532}
1533
1534/**
1535 * ubifs_remount_ro - re-mount in read-only mode. 1621 * ubifs_remount_ro - re-mount in read-only mode.
1536 * @c: UBIFS file-system description object 1622 * @c: UBIFS file-system description object
1537 * 1623 *
1538 * We rely on VFS to have stopped writing. Possibly the background thread could 1624 * We assume VFS has stopped writing. Possibly the background thread could be
1539 * be running a commit, however kthread_stop will wait in that case. 1625 * running a commit, however kthread_stop will wait in that case.
1540 */ 1626 */
1541static void ubifs_remount_ro(struct ubifs_info *c) 1627static void ubifs_remount_ro(struct ubifs_info *c)
1542{ 1628{
1543 int i, err; 1629 int i, err;
1544 1630
1545 ubifs_assert(!c->need_recovery); 1631 ubifs_assert(!c->need_recovery);
1546 commit_on_unmount(c); 1632 ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
1547 1633
1548 mutex_lock(&c->umount_mutex); 1634 mutex_lock(&c->umount_mutex);
1549 if (c->bgt) { 1635 if (c->bgt) {
@@ -1551,27 +1637,29 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1551 c->bgt = NULL; 1637 c->bgt = NULL;
1552 } 1638 }
1553 1639
1640 dbg_save_space_info(c);
1641
1554 for (i = 0; i < c->jhead_cnt; i++) { 1642 for (i = 0; i < c->jhead_cnt; i++) {
1555 ubifs_wbuf_sync(&c->jheads[i].wbuf); 1643 ubifs_wbuf_sync(&c->jheads[i].wbuf);
1556 del_timer_sync(&c->jheads[i].wbuf.timer); 1644 del_timer_sync(&c->jheads[i].wbuf.timer);
1557 } 1645 }
1558 1646
1559 if (!c->ro_media) { 1647 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
1560 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); 1648 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
1561 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); 1649 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
1562 c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); 1650 err = ubifs_write_master(c);
1563 err = ubifs_write_master(c); 1651 if (err)
1564 if (err) 1652 ubifs_ro_mode(c, err);
1565 ubifs_ro_mode(c, err);
1566 }
1567 1653
1568 ubifs_destroy_idx_gc(c);
1569 free_wbufs(c); 1654 free_wbufs(c);
1570 vfree(c->orph_buf); 1655 vfree(c->orph_buf);
1571 c->orph_buf = NULL; 1656 c->orph_buf = NULL;
1572 vfree(c->ileb_buf); 1657 vfree(c->ileb_buf);
1573 c->ileb_buf = NULL; 1658 c->ileb_buf = NULL;
1574 ubifs_lpt_free(c, 1); 1659 ubifs_lpt_free(c, 1);
1660 err = dbg_check_space_info(c);
1661 if (err)
1662 ubifs_ro_mode(c, err);
1575 mutex_unlock(&c->umount_mutex); 1663 mutex_unlock(&c->umount_mutex);
1576} 1664}
1577 1665
@@ -1664,11 +1752,20 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1664 } 1752 }
1665 1753
1666 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1754 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1755 if (c->ro_media) {
1756 ubifs_msg("cannot re-mount due to prior errors");
1757 return -EROFS;
1758 }
1667 err = ubifs_remount_rw(c); 1759 err = ubifs_remount_rw(c);
1668 if (err) 1760 if (err)
1669 return err; 1761 return err;
1670 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) 1762 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1763 if (c->ro_media) {
1764 ubifs_msg("cannot re-mount due to prior errors");
1765 return -EROFS;
1766 }
1671 ubifs_remount_ro(c); 1767 ubifs_remount_ro(c);
1768 }
1672 1769
1673 if (c->bulk_read == 1) 1770 if (c->bulk_read == 1)
1674 bu_init(c); 1771 bu_init(c);
@@ -1678,10 +1775,11 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1678 c->bu.buf = NULL; 1775 c->bu.buf = NULL;
1679 } 1776 }
1680 1777
1778 ubifs_assert(c->lst.taken_empty_lebs == 1);
1681 return 0; 1779 return 0;
1682} 1780}
1683 1781
1684struct super_operations ubifs_super_operations = { 1782const struct super_operations ubifs_super_operations = {
1685 .alloc_inode = ubifs_alloc_inode, 1783 .alloc_inode = ubifs_alloc_inode,
1686 .destroy_inode = ubifs_destroy_inode, 1784 .destroy_inode = ubifs_destroy_inode,
1687 .put_super = ubifs_put_super, 1785 .put_super = ubifs_put_super,
@@ -1849,7 +1947,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1849 goto out_iput; 1947 goto out_iput;
1850 1948
1851 mutex_unlock(&c->umount_mutex); 1949 mutex_unlock(&c->umount_mutex);
1852
1853 return 0; 1950 return 0;
1854 1951
1855out_iput: 1952out_iput:
@@ -1949,15 +2046,6 @@ out_close:
1949 2046
1950static void ubifs_kill_sb(struct super_block *sb) 2047static void ubifs_kill_sb(struct super_block *sb)
1951{ 2048{
1952 struct ubifs_info *c = sb->s_fs_info;
1953
1954 /*
1955 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
1956 * in order to be outside BKL.
1957 */
1958 if (sb->s_root && !(sb->s_flags & MS_RDONLY))
1959 commit_on_unmount(c);
1960 /* The un-mount routine is actually done in put_super() */
1961 generic_shutdown_super(sb); 2049 generic_shutdown_super(sb);
1962} 2050}
1963 2051
@@ -2021,6 +2109,14 @@ static int __init ubifs_init(void)
2021 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); 2109 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
2022 2110
2023 /* 2111 /*
2112 * We use 2 bit wide bit-fields to store compression type, which should
2113 * be amended if more compressors are added. The bit-fields are:
2114 * @compr_type in 'struct ubifs_inode', @default_compr in
2115 * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'.
2116 */
2117 BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
2118
2119 /*
2024 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to 2120 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
2025 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. 2121 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
2026 */ 2122 */
@@ -2049,11 +2145,17 @@ static int __init ubifs_init(void)
2049 2145
2050 err = ubifs_compressors_init(); 2146 err = ubifs_compressors_init();
2051 if (err) 2147 if (err)
2148 goto out_shrinker;
2149
2150 err = dbg_debugfs_init();
2151 if (err)
2052 goto out_compr; 2152 goto out_compr;
2053 2153
2054 return 0; 2154 return 0;
2055 2155
2056out_compr: 2156out_compr:
2157 ubifs_compressors_exit();
2158out_shrinker:
2057 unregister_shrinker(&ubifs_shrinker_info); 2159 unregister_shrinker(&ubifs_shrinker_info);
2058 kmem_cache_destroy(ubifs_inode_slab); 2160 kmem_cache_destroy(ubifs_inode_slab);
2059out_reg: 2161out_reg:
@@ -2068,6 +2170,7 @@ static void __exit ubifs_exit(void)
2068 ubifs_assert(list_empty(&ubifs_infos)); 2170 ubifs_assert(list_empty(&ubifs_infos));
2069 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); 2171 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
2070 2172
2173 dbg_debugfs_exit();
2071 ubifs_compressors_exit(); 2174 ubifs_compressors_exit();
2072 unregister_shrinker(&ubifs_shrinker_info); 2175 unregister_shrinker(&ubifs_shrinker_info);
2073 kmem_cache_destroy(ubifs_inode_slab); 2176 kmem_cache_destroy(ubifs_inode_slab);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6eef5344a145..fa28a84c6a1b 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -443,6 +443,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
443 * This function performs that same function as ubifs_read_node except that 443 * This function performs that same function as ubifs_read_node except that
444 * it does not require that there is actually a node present and instead 444 * it does not require that there is actually a node present and instead
445 * the return code indicates if a node was read. 445 * the return code indicates if a node was read.
446 *
447 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
448 * is true (it is controlled by corresponding mount option). However, if
449 * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
450 * checked.
446 */ 451 */
447static int try_read_node(const struct ubifs_info *c, void *buf, int type, 452static int try_read_node(const struct ubifs_info *c, void *buf, int type,
448 int len, int lnum, int offs) 453 int len, int lnum, int offs)
@@ -470,9 +475,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
470 if (node_len != len) 475 if (node_len != len)
471 return 0; 476 return 0;
472 477
473 if (type == UBIFS_DATA_NODE && !c->always_chk_crc) 478 if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
474 if (c->no_chk_data_crc) 479 return 1;
475 return 0;
476 480
477 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 481 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
478 node_crc = le32_to_cpu(ch->crc); 482 node_crc = le32_to_cpu(ch->crc);
@@ -1506,7 +1510,7 @@ out:
1506 * 1510 *
1507 * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function 1511 * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
1508 * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares 1512 * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
1509 * maxumum possible amount of nodes for bulk-read. 1513 * maximum possible amount of nodes for bulk-read.
1510 */ 1514 */
1511int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) 1515int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
1512{ 1516{
@@ -2245,12 +2249,11 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
2245 if (found) { 2249 if (found) {
2246 /* Ensure the znode is dirtied */ 2250 /* Ensure the znode is dirtied */
2247 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2251 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2248 znode = dirty_cow_bottom_up(c, 2252 znode = dirty_cow_bottom_up(c, znode);
2249 znode); 2253 if (IS_ERR(znode)) {
2250 if (IS_ERR(znode)) { 2254 err = PTR_ERR(znode);
2251 err = PTR_ERR(znode); 2255 goto out_unlock;
2252 goto out_unlock; 2256 }
2253 }
2254 } 2257 }
2255 zbr = &znode->zbranch[n]; 2258 zbr = &znode->zbranch[n];
2256 lnc_free(zbr); 2259 lnc_free(zbr);
@@ -2317,11 +2320,11 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
2317 2320
2318 /* Ensure the znode is dirtied */ 2321 /* Ensure the znode is dirtied */
2319 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2322 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2320 znode = dirty_cow_bottom_up(c, znode); 2323 znode = dirty_cow_bottom_up(c, znode);
2321 if (IS_ERR(znode)) { 2324 if (IS_ERR(znode)) {
2322 err = PTR_ERR(znode); 2325 err = PTR_ERR(znode);
2323 goto out_unlock; 2326 goto out_unlock;
2324 } 2327 }
2325 } 2328 }
2326 2329
2327 if (found == 1) { 2330 if (found == 1) {
@@ -2627,11 +2630,11 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
2627 2630
2628 /* Ensure the znode is dirtied */ 2631 /* Ensure the znode is dirtied */
2629 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2632 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2630 znode = dirty_cow_bottom_up(c, znode); 2633 znode = dirty_cow_bottom_up(c, znode);
2631 if (IS_ERR(znode)) { 2634 if (IS_ERR(znode)) {
2632 err = PTR_ERR(znode); 2635 err = PTR_ERR(znode);
2633 goto out_unlock; 2636 goto out_unlock;
2634 } 2637 }
2635 } 2638 }
2636 2639
2637 /* Remove all keys in range except the first */ 2640 /* Remove all keys in range except the first */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8ac76b1c2d55..fde8d127c768 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -553,8 +553,8 @@ static int layout_in_empty_space(struct ubifs_info *c)
553 } 553 }
554 554
555#ifdef CONFIG_UBIFS_FS_DEBUG 555#ifdef CONFIG_UBIFS_FS_DEBUG
556 c->new_ihead_lnum = lnum; 556 c->dbg->new_ihead_lnum = lnum;
557 c->new_ihead_offs = buf_offs; 557 c->dbg->new_ihead_offs = buf_offs;
558#endif 558#endif
559 559
560 return 0; 560 return 0;
@@ -802,8 +802,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
802 * budgeting subsystem to assume the index is already committed, 802 * budgeting subsystem to assume the index is already committed,
803 * even though it is not. 803 * even though it is not.
804 */ 804 */
805 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
805 c->old_idx_sz = c->calc_idx_sz; 806 c->old_idx_sz = c->calc_idx_sz;
806 c->budg_uncommitted_idx = 0; 807 c->budg_uncommitted_idx = 0;
808 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
807 spin_unlock(&c->space_lock); 809 spin_unlock(&c->space_lock);
808 mutex_unlock(&c->tnc_mutex); 810 mutex_unlock(&c->tnc_mutex);
809 811
@@ -1002,7 +1004,8 @@ static int write_index(struct ubifs_info *c)
1002 } 1004 }
1003 1005
1004#ifdef CONFIG_UBIFS_FS_DEBUG 1006#ifdef CONFIG_UBIFS_FS_DEBUG
1005 if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) { 1007 if (lnum != c->dbg->new_ihead_lnum ||
1008 buf_offs != c->dbg->new_ihead_offs) {
1006 ubifs_err("inconsistent ihead"); 1009 ubifs_err("inconsistent ihead");
1007 return -EINVAL; 1010 return -EINVAL;
1008 } 1011 }
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0b378042a3a2..b25fc36cf72f 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -51,6 +51,13 @@
51 */ 51 */
52#define UBIFS_MIN_COMPR_LEN 128 52#define UBIFS_MIN_COMPR_LEN 128
53 53
54/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS prefers to leave this data
57 * node uncompressed, because it'll be read faster.
58 */
59#define UBIFS_MIN_COMPRESS_DIFF 64
60
54/* Root inode number */ 61/* Root inode number */
55#define UBIFS_ROOT_INO 1 62#define UBIFS_ROOT_INO 1
56 63
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 46b172560a06..039a68bee29a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -63,6 +63,14 @@
63#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL 63#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
64#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL 64#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL
65 65
66/*
67 * Minimum amount of LEBs reserved for the index. At present the index needs at
68 * least 2 LEBs: one for the index head and one for in-the-gaps method (which
69 * currently does not cater for the index head and so excludes it from
70 * consideration).
71 */
72#define MIN_INDEX_LEBS 2
73
66/* Minimum amount of data UBIFS writes to the flash */ 74/* Minimum amount of data UBIFS writes to the flash */
67#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) 75#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
68 76
@@ -386,12 +394,12 @@ struct ubifs_inode {
386 unsigned int dirty:1; 394 unsigned int dirty:1;
387 unsigned int xattr:1; 395 unsigned int xattr:1;
388 unsigned int bulk_read:1; 396 unsigned int bulk_read:1;
397 unsigned int compr_type:2;
389 struct mutex ui_mutex; 398 struct mutex ui_mutex;
390 spinlock_t ui_lock; 399 spinlock_t ui_lock;
391 loff_t synced_i_size; 400 loff_t synced_i_size;
392 loff_t ui_size; 401 loff_t ui_size;
393 int flags; 402 int flags;
394 int compr_type;
395 pgoff_t last_page_read; 403 pgoff_t last_page_read;
396 pgoff_t read_in_a_row; 404 pgoff_t read_in_a_row;
397 int data_len; 405 int data_len;
@@ -418,9 +426,9 @@ struct ubifs_unclean_leb {
418 * LEB properties flags. 426 * LEB properties flags.
419 * 427 *
420 * LPROPS_UNCAT: not categorized 428 * LPROPS_UNCAT: not categorized
421 * LPROPS_DIRTY: dirty > 0, not index 429 * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index
422 * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index 430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sz and index
423 * LPROPS_FREE: free > 0, not empty, not index 431 * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index
424 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs 432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
425 * LPROPS_EMPTY: LEB is empty, not taken 433 * LPROPS_EMPTY: LEB is empty, not taken
426 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken 434 * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
@@ -473,8 +481,8 @@ struct ubifs_lprops {
473struct ubifs_lpt_lprops { 481struct ubifs_lpt_lprops {
474 int free; 482 int free;
475 int dirty; 483 int dirty;
476 unsigned tgc : 1; 484 unsigned tgc:1;
477 unsigned cmt : 1; 485 unsigned cmt:1;
478}; 486};
479 487
480/** 488/**
@@ -482,24 +490,26 @@ struct ubifs_lpt_lprops {
482 * @empty_lebs: number of empty LEBs 490 * @empty_lebs: number of empty LEBs
483 * @taken_empty_lebs: number of taken LEBs 491 * @taken_empty_lebs: number of taken LEBs
484 * @idx_lebs: number of indexing LEBs 492 * @idx_lebs: number of indexing LEBs
485 * @total_free: total free space in bytes 493 * @total_free: total free space in bytes (includes all LEBs)
486 * @total_dirty: total dirty space in bytes 494 * @total_dirty: total dirty space in bytes (includes all LEBs)
487 * @total_used: total used space in bytes (includes only data LEBs) 495 * @total_used: total used space in bytes (does not include index LEBs)
488 * @total_dead: total dead space in bytes (includes only data LEBs) 496 * @total_dead: total dead space in bytes (does not include index LEBs)
489 * @total_dark: total dark space in bytes (includes only data LEBs) 497 * @total_dark: total dark space in bytes (does not include index LEBs)
498 *
499 * The @taken_empty_lebs field counts the LEBs that are in the transient state
500 * of having been "taken" for use but not yet written to. @taken_empty_lebs is
501 * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be
502 * used by itself (in which case 'unused_lebs' would be a better name). In the
503 * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained
504 * by GC, but unlike other empty LEBs that are "taken", it may not be written
505 * straight away (i.e. before the next commit start or unmount), so either
506 * @gc_lnum must be specially accounted for, or the current approach followed
507 * i.e. count it under @taken_empty_lebs.
490 * 508 *
491 * N.B. total_dirty and total_used are different to other total_* fields, 509 * @empty_lebs includes @taken_empty_lebs.
492 * because they account _all_ LEBs, not just data LEBs.
493 * 510 *
494 * 'taken_empty_lebs' counts the LEBs that are in the transient state of having 511 * @total_used, @total_dead and @total_dark fields do not account indexing
495 * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed 512 * LEBs.
496 * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
497 * by itself (in which case 'unused_lebs' would be a better name). In the case
498 * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
499 * but unlike other empty LEBs that are 'taken', it may not be written straight
500 * away (i.e. before the next commit start or unmount), so either gc_lnum must
501 * be specially accounted for, or the current approach followed i.e. count it
502 * under 'taken_empty_lebs'.
503 */ 513 */
504struct ubifs_lp_stats { 514struct ubifs_lp_stats {
505 int empty_lebs; 515 int empty_lebs;
@@ -893,15 +903,25 @@ struct ubifs_orphan {
893/** 903/**
894 * struct ubifs_mount_opts - UBIFS-specific mount options information. 904 * struct ubifs_mount_opts - UBIFS-specific mount options information.
895 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) 905 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
896 * @bulk_read: enable bulk-reads 906 * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable)
897 * @chk_data_crc: check CRCs when reading data nodes 907 * @chk_data_crc: enable/disable CRC data checking when reading data nodes
908 * (%0 default, %1 disable, %2 enable)
909 * @override_compr: override default compressor (%0 - do not override and use
910 * superblock compressor, %1 - override and use compressor
911 * specified in @compr_type)
912 * @compr_type: compressor type to override the superblock compressor with
913 * (%UBIFS_COMPR_NONE, etc)
898 */ 914 */
899struct ubifs_mount_opts { 915struct ubifs_mount_opts {
900 unsigned int unmount_mode:2; 916 unsigned int unmount_mode:2;
901 unsigned int bulk_read:2; 917 unsigned int bulk_read:2;
902 unsigned int chk_data_crc:2; 918 unsigned int chk_data_crc:2;
919 unsigned int override_compr:1;
920 unsigned int compr_type:2;
903}; 921};
904 922
923struct ubifs_debug_info;
924
905/** 925/**
906 * struct ubifs_info - UBIFS file-system description data structure 926 * struct ubifs_info - UBIFS file-system description data structure
907 * (per-superblock). 927 * (per-superblock).
@@ -941,11 +961,11 @@ struct ubifs_mount_opts {
941 * @cs_lock: commit state lock 961 * @cs_lock: commit state lock
942 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running 962 * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
943 * 963 *
944 * @fast_unmount: do not run journal commit before un-mounting
945 * @big_lpt: flag that LPT is too big to write whole during commit 964 * @big_lpt: flag that LPT is too big to write whole during commit
946 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 965 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
947 * recovery) 966 * recovery)
948 * @bulk_read: enable bulk-reads 967 * @bulk_read: enable bulk-reads
968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
949 * 969 *
950 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 970 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
951 * @calc_idx_sz 971 * @calc_idx_sz
@@ -963,8 +983,6 @@ struct ubifs_mount_opts {
963 * @ileb_nxt: next pre-allocated index LEBs 983 * @ileb_nxt: next pre-allocated index LEBs
964 * @old_idx: tree of index nodes obsoleted since the last commit start 984 * @old_idx: tree of index nodes obsoleted since the last commit start
965 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c 985 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
966 * @new_ihead_lnum: used by debugging to check ihead_lnum
967 * @new_ihead_offs: used by debugging to check ihead_offs
968 * 986 *
969 * @mst_node: master node 987 * @mst_node: master node
970 * @mst_offs: offset of valid master node 988 * @mst_offs: offset of valid master node
@@ -986,7 +1004,6 @@ struct ubifs_mount_opts {
986 * @main_lebs: count of LEBs in the main area 1004 * @main_lebs: count of LEBs in the main area
987 * @main_first: first LEB of the main area 1005 * @main_first: first LEB of the main area
988 * @main_bytes: main area size in bytes 1006 * @main_bytes: main area size in bytes
989 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
990 * 1007 *
991 * @key_hash_type: type of the key hash 1008 * @key_hash_type: type of the key hash
992 * @key_hash: direntry key hash function 1009 * @key_hash: direntry key hash function
@@ -1149,15 +1166,7 @@ struct ubifs_mount_opts {
1149 * @always_chk_crc: always check CRCs (while mounting and remounting rw) 1166 * @always_chk_crc: always check CRCs (while mounting and remounting rw)
1150 * @mount_opts: UBIFS-specific mount options 1167 * @mount_opts: UBIFS-specific mount options
1151 * 1168 *
1152 * @dbg_buf: a buffer of LEB size used for debugging purposes 1169 * @dbg: debugging-related information
1153 * @old_zroot: old index root - used by 'dbg_check_old_index()'
1154 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
1155 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
1156 * @failure_mode: failure mode for recovery testing
1157 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
1158 * @fail_timeout: time in jiffies when delay of failure mode expires
1159 * @fail_cnt: current number of calls to failure mode I/O functions
1160 * @fail_cnt_max: number of calls by which to delay failure mode
1161 */ 1170 */
1162struct ubifs_info { 1171struct ubifs_info {
1163 struct super_block *vfs_sb; 1172 struct super_block *vfs_sb;
@@ -1192,10 +1201,10 @@ struct ubifs_info {
1192 spinlock_t cs_lock; 1201 spinlock_t cs_lock;
1193 wait_queue_head_t cmt_wq; 1202 wait_queue_head_t cmt_wq;
1194 1203
1195 unsigned int fast_unmount:1;
1196 unsigned int big_lpt:1; 1204 unsigned int big_lpt:1;
1197 unsigned int no_chk_data_crc:1; 1205 unsigned int no_chk_data_crc:1;
1198 unsigned int bulk_read:1; 1206 unsigned int bulk_read:1;
1207 unsigned int default_compr:2;
1199 1208
1200 struct mutex tnc_mutex; 1209 struct mutex tnc_mutex;
1201 struct ubifs_zbranch zroot; 1210 struct ubifs_zbranch zroot;
@@ -1212,10 +1221,6 @@ struct ubifs_info {
1212 int ileb_nxt; 1221 int ileb_nxt;
1213 struct rb_root old_idx; 1222 struct rb_root old_idx;
1214 int *bottom_up_buf; 1223 int *bottom_up_buf;
1215#ifdef CONFIG_UBIFS_FS_DEBUG
1216 int new_ihead_lnum;
1217 int new_ihead_offs;
1218#endif
1219 1224
1220 struct ubifs_mst_node *mst_node; 1225 struct ubifs_mst_node *mst_node;
1221 int mst_offs; 1226 int mst_offs;
@@ -1237,7 +1242,6 @@ struct ubifs_info {
1237 int main_lebs; 1242 int main_lebs;
1238 int main_first; 1243 int main_first;
1239 long long main_bytes; 1244 long long main_bytes;
1240 int default_compr;
1241 1245
1242 uint8_t key_hash_type; 1246 uint8_t key_hash_type;
1243 uint32_t (*key_hash)(const char *str, int len); 1247 uint32_t (*key_hash)(const char *str, int len);
@@ -1315,8 +1319,8 @@ struct ubifs_info {
1315 void *sbuf; 1319 void *sbuf;
1316 struct list_head idx_gc; 1320 struct list_head idx_gc;
1317 int idx_gc_cnt; 1321 int idx_gc_cnt;
1318 volatile int gc_seq; 1322 int gc_seq;
1319 volatile int gced_lnum; 1323 int gced_lnum;
1320 1324
1321 struct list_head infos_list; 1325 struct list_head infos_list;
1322 struct mutex umount_mutex; 1326 struct mutex umount_mutex;
@@ -1391,21 +1395,7 @@ struct ubifs_info {
1391 struct ubifs_mount_opts mount_opts; 1395 struct ubifs_mount_opts mount_opts;
1392 1396
1393#ifdef CONFIG_UBIFS_FS_DEBUG 1397#ifdef CONFIG_UBIFS_FS_DEBUG
1394 void *dbg_buf; 1398 struct ubifs_debug_info *dbg;
1395 struct ubifs_zbranch old_zroot;
1396 int old_zroot_level;
1397 unsigned long long old_zroot_sqnum;
1398 int failure_mode;
1399 int fail_delay;
1400 unsigned long fail_timeout;
1401 unsigned int fail_cnt;
1402 unsigned int fail_cnt_max;
1403 long long chk_lpt_sz;
1404 long long chk_lpt_sz2;
1405 long long chk_lpt_wastage;
1406 int chk_lpt_lebs;
1407 int new_nhead_lnum;
1408 int new_nhead_offs;
1409#endif 1399#endif
1410}; 1400};
1411 1401
@@ -1413,13 +1403,13 @@ extern struct list_head ubifs_infos;
1413extern spinlock_t ubifs_infos_lock; 1403extern spinlock_t ubifs_infos_lock;
1414extern atomic_long_t ubifs_clean_zn_cnt; 1404extern atomic_long_t ubifs_clean_zn_cnt;
1415extern struct kmem_cache *ubifs_inode_slab; 1405extern struct kmem_cache *ubifs_inode_slab;
1416extern struct super_operations ubifs_super_operations; 1406extern const struct super_operations ubifs_super_operations;
1417extern struct address_space_operations ubifs_file_address_operations; 1407extern const struct address_space_operations ubifs_file_address_operations;
1418extern struct file_operations ubifs_file_operations; 1408extern const struct file_operations ubifs_file_operations;
1419extern struct inode_operations ubifs_file_inode_operations; 1409extern const struct inode_operations ubifs_file_inode_operations;
1420extern struct file_operations ubifs_dir_operations; 1410extern const struct file_operations ubifs_dir_operations;
1421extern struct inode_operations ubifs_dir_inode_operations; 1411extern const struct inode_operations ubifs_dir_inode_operations;
1422extern struct inode_operations ubifs_symlink_inode_operations; 1412extern const struct inode_operations ubifs_symlink_inode_operations;
1423extern struct backing_dev_info ubifs_backing_dev_info; 1413extern struct backing_dev_info ubifs_backing_dev_info;
1424extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; 1414extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
1425 1415
@@ -1436,7 +1426,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
1436int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, 1426int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
1437 int offs, int dtype); 1427 int offs, int dtype);
1438int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, 1428int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
1439 int offs, int quiet, int chk_crc); 1429 int offs, int quiet, int must_chk_crc);
1440void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); 1430void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
1441void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); 1431void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
1442int ubifs_io_init(struct ubifs_info *c); 1432int ubifs_io_init(struct ubifs_info *c);
@@ -1503,9 +1493,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1503void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, 1493void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1504 struct ubifs_budget_req *req); 1494 struct ubifs_budget_req *req);
1505long long ubifs_get_free_space(struct ubifs_info *c); 1495long long ubifs_get_free_space(struct ubifs_info *c);
1496long long ubifs_get_free_space_nolock(struct ubifs_info *c);
1506int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1497int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1507void ubifs_convert_page_budget(struct ubifs_info *c); 1498void ubifs_convert_page_budget(struct ubifs_info *c);
1508long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); 1499long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1509long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1500long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1510 1501
1511/* find.c */ 1502/* find.c */
@@ -1611,6 +1602,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
1611int ubifs_orphan_start_commit(struct ubifs_info *c); 1602int ubifs_orphan_start_commit(struct ubifs_info *c);
1612int ubifs_orphan_end_commit(struct ubifs_info *c); 1603int ubifs_orphan_end_commit(struct ubifs_info *c);
1613int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); 1604int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
1605int ubifs_clear_orphans(struct ubifs_info *c);
1614 1606
1615/* lpt.c */ 1607/* lpt.c */
1616int ubifs_calc_lpt_geom(struct ubifs_info *c); 1608int ubifs_calc_lpt_geom(struct ubifs_info *c);
@@ -1639,6 +1631,9 @@ void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
1639void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); 1631void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
1640uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); 1632uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
1641struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); 1633struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
1634/* Needed only in debugging code in lpt_commit.c */
1635int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
1636 struct ubifs_nnode *nnode);
1642 1637
1643/* lpt_commit.c */ 1638/* lpt_commit.c */
1644int ubifs_lpt_start_commit(struct ubifs_info *c); 1639int ubifs_lpt_start_commit(struct ubifs_info *c);
@@ -1651,7 +1646,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
1651 const struct ubifs_lprops *lp, 1646 const struct ubifs_lprops *lp,
1652 int free, int dirty, int flags, 1647 int free, int dirty, int flags,
1653 int idx_gc_cnt); 1648 int idx_gc_cnt);
1654void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats); 1649void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst);
1655void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, 1650void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
1656 int cat); 1651 int cat);
1657void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, 1652void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
@@ -1714,7 +1709,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1714 1709
1715/* compressor.c */ 1710/* compressor.c */
1716int __init ubifs_compressors_init(void); 1711int __init ubifs_compressors_init(void);
1717void __exit ubifs_compressors_exit(void); 1712void ubifs_compressors_exit(void);
1718void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, 1713void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
1719 int *compr_type); 1714 int *compr_type);
1720int ubifs_decompress(const void *buf, int len, void *out, int *out_len, 1715int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
new file mode 100644
index 000000000000..0e0e99bd6bce
--- /dev/null
+++ b/fs/udf/Kconfig
@@ -0,0 +1,18 @@
1config UDF_FS
2 tristate "UDF file system support"
3 select CRC_ITU_T
4 help
5 This is the new file system used on some CD-ROMs and DVDs. Say Y if
6 you intend to mount DVD discs or CDRW's written in packet mode, or
7 if written to by other UDF utilities, such as DirectCD.
8 Please read <file:Documentation/filesystems/udf.txt>.
9
10 To compile this file system support as a module, choose M here: the
11 module will be called udf.
12
13 If unsure, say N.
14
15config UDF_NLS
16 bool
17 default y
18 depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
new file mode 100644
index 000000000000..e4f10a40768a
--- /dev/null
+++ b/fs/ufs/Kconfig
@@ -0,0 +1,43 @@
1config UFS_FS
2 tristate "UFS file system support (read only)"
3 depends on BLOCK
4 help
5 BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
6 OpenBSD and NeXTstep) use a file system called UFS. Some System V
7 Unixes can create and mount hard disk partitions and diskettes using
8 this file system as well. Saying Y here will allow you to read from
9 these partitions; if you also want to write to them, say Y to the
10 experimental "UFS file system write support", below. Please read the
11 file <file:Documentation/filesystems/ufs.txt> for more information.
12
13 The recently released UFS2 variant (used in FreeBSD 5.x) is
14 READ-ONLY supported.
15
16 Note that this option is generally not needed for floppies, since a
17 good portable way to transport files and directories between unixes
18 (and even other operating systems) is given by the tar program ("man
19 tar" or preferably "info tar").
20
21 When accessing NeXTstep files, you may need to convert them from the
22 NeXT character set to the Latin1 character set; use the program
23 recode ("info recode") for this purpose.
24
25 To compile the UFS file system support as a module, choose M here: the
26 module will be called ufs.
27
28 If you haven't heard about all of this before, it's safe to say N.
29
30config UFS_FS_WRITE
31 bool "UFS file system write support (DANGEROUS)"
32 depends on UFS_FS && EXPERIMENTAL
33 help
34 Say Y here if you want to try writing to UFS partitions. This is
35 experimental, so you should back up your UFS partitions beforehand.
36
37config UFS_DEBUG
38 bool "UFS debugging"
39 depends on UFS_FS
40 help
41 If you are experiencing any problems with the UFS filesystem, say
42 Y here. This will result in _many_ additional debugging messages to be
43 written to the system log.
diff --git a/fs/utimes.c b/fs/utimes.c
index 6929e3e91d05..e4c75db5d373 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -24,7 +24,7 @@
24 * must be owner or have write permission. 24 * must be owner or have write permission.
25 * Else, update from *times, must be owner or super user. 25 * Else, update from *times, must be owner or super user.
26 */ 26 */
27asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times) 27SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
28{ 28{
29 struct timespec tv[2]; 29 struct timespec tv[2];
30 30
@@ -170,7 +170,8 @@ out:
170 return error; 170 return error;
171} 171}
172 172
173asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __user *utimes, int flags) 173SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
174 struct timespec __user *, utimes, int, flags)
174{ 175{
175 struct timespec tstimes[2]; 176 struct timespec tstimes[2];
176 177
@@ -187,7 +188,8 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __
187 return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); 188 return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
188} 189}
189 190
190asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes) 191SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename,
192 struct timeval __user *, utimes)
191{ 193{
192 struct timeval times[2]; 194 struct timeval times[2];
193 struct timespec tstimes[2]; 195 struct timespec tstimes[2];
@@ -214,7 +216,8 @@ asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __u
214 return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); 216 return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0);
215} 217}
216 218
217asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes) 219SYSCALL_DEFINE2(utimes, char __user *, filename,
220 struct timeval __user *, utimes)
218{ 221{
219 return sys_futimesat(AT_FDCWD, filename, utimes); 222 return sys_futimesat(AT_FDCWD, filename, utimes);
220} 223}
diff --git a/fs/xattr.c b/fs/xattr.c
index 468377e66531..197c4fcac032 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
175 if (error) 175 if (error)
176 return error; 176 return error;
177 error = -EOPNOTSUPP; 177 error = -EOPNOTSUPP;
178 if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { 178 if (d->d_inode->i_op->listxattr) {
179 error = d->d_inode->i_op->listxattr(d, list, size); 179 error = d->d_inode->i_op->listxattr(d, list, size);
180 } else { 180 } else {
181 error = security_inode_listsecurity(d->d_inode, list, size); 181 error = security_inode_listsecurity(d->d_inode, list, size);
@@ -251,9 +251,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
251 return error; 251 return error;
252} 252}
253 253
254asmlinkage long 254SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
255sys_setxattr(const char __user *pathname, const char __user *name, 255 const char __user *, name, const void __user *, value,
256 const void __user *value, size_t size, int flags) 256 size_t, size, int, flags)
257{ 257{
258 struct path path; 258 struct path path;
259 int error; 259 int error;
@@ -270,9 +270,9 @@ sys_setxattr(const char __user *pathname, const char __user *name,
270 return error; 270 return error;
271} 271}
272 272
273asmlinkage long 273SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
274sys_lsetxattr(const char __user *pathname, const char __user *name, 274 const char __user *, name, const void __user *, value,
275 const void __user *value, size_t size, int flags) 275 size_t, size, int, flags)
276{ 276{
277 struct path path; 277 struct path path;
278 int error; 278 int error;
@@ -289,9 +289,8 @@ sys_lsetxattr(const char __user *pathname, const char __user *name,
289 return error; 289 return error;
290} 290}
291 291
292asmlinkage long 292SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
293sys_fsetxattr(int fd, const char __user *name, const void __user *value, 293 const void __user *,value, size_t, size, int, flags)
294 size_t size, int flags)
295{ 294{
296 struct file *f; 295 struct file *f;
297 struct dentry *dentry; 296 struct dentry *dentry;
@@ -349,9 +348,8 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
349 return error; 348 return error;
350} 349}
351 350
352asmlinkage ssize_t 351SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
353sys_getxattr(const char __user *pathname, const char __user *name, 352 const char __user *, name, void __user *, value, size_t, size)
354 void __user *value, size_t size)
355{ 353{
356 struct path path; 354 struct path path;
357 ssize_t error; 355 ssize_t error;
@@ -364,9 +362,8 @@ sys_getxattr(const char __user *pathname, const char __user *name,
364 return error; 362 return error;
365} 363}
366 364
367asmlinkage ssize_t 365SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
368sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value, 366 const char __user *, name, void __user *, value, size_t, size)
369 size_t size)
370{ 367{
371 struct path path; 368 struct path path;
372 ssize_t error; 369 ssize_t error;
@@ -379,8 +376,8 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user
379 return error; 376 return error;
380} 377}
381 378
382asmlinkage ssize_t 379SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
383sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size) 380 void __user *, value, size_t, size)
384{ 381{
385 struct file *f; 382 struct file *f;
386 ssize_t error = -EBADF; 383 ssize_t error = -EBADF;
@@ -424,8 +421,8 @@ listxattr(struct dentry *d, char __user *list, size_t size)
424 return error; 421 return error;
425} 422}
426 423
427asmlinkage ssize_t 424SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
428sys_listxattr(const char __user *pathname, char __user *list, size_t size) 425 size_t, size)
429{ 426{
430 struct path path; 427 struct path path;
431 ssize_t error; 428 ssize_t error;
@@ -438,8 +435,8 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size)
438 return error; 435 return error;
439} 436}
440 437
441asmlinkage ssize_t 438SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
442sys_llistxattr(const char __user *pathname, char __user *list, size_t size) 439 size_t, size)
443{ 440{
444 struct path path; 441 struct path path;
445 ssize_t error; 442 ssize_t error;
@@ -452,8 +449,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
452 return error; 449 return error;
453} 450}
454 451
455asmlinkage ssize_t 452SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
456sys_flistxattr(int fd, char __user *list, size_t size)
457{ 453{
458 struct file *f; 454 struct file *f;
459 ssize_t error = -EBADF; 455 ssize_t error = -EBADF;
@@ -485,8 +481,8 @@ removexattr(struct dentry *d, const char __user *name)
485 return vfs_removexattr(d, kname); 481 return vfs_removexattr(d, kname);
486} 482}
487 483
488asmlinkage long 484SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
489sys_removexattr(const char __user *pathname, const char __user *name) 485 const char __user *, name)
490{ 486{
491 struct path path; 487 struct path path;
492 int error; 488 int error;
@@ -503,8 +499,8 @@ sys_removexattr(const char __user *pathname, const char __user *name)
503 return error; 499 return error;
504} 500}
505 501
506asmlinkage long 502SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
507sys_lremovexattr(const char __user *pathname, const char __user *name) 503 const char __user *, name)
508{ 504{
509 struct path path; 505 struct path path;
510 int error; 506 int error;
@@ -521,8 +517,7 @@ sys_lremovexattr(const char __user *pathname, const char __user *name)
521 return error; 517 return error;
522} 518}
523 519
524asmlinkage long 520SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
525sys_fremovexattr(int fd, const char __user *name)
526{ 521{
527 struct file *f; 522 struct file *f;
528 struct dentry *dentry; 523 struct dentry *dentry;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 3f53dd101f99..29228f5899cd 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,7 @@
1config XFS_FS 1config XFS_FS
2 tristate "XFS filesystem support" 2 tristate "XFS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 select EXPORTFS
4 help 5 help
5 XFS is a high performance journaling filesystem which originated 6 XFS is a high performance journaling filesystem which originated
6 on the SGI IRIX platform. It is completely multi-threaded, can 7 on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 7b26f5ff9692..1dd528849755 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -21,8 +21,6 @@
21extern struct workqueue_struct *xfsdatad_workqueue; 21extern struct workqueue_struct *xfsdatad_workqueue;
22extern mempool_t *xfs_ioend_pool; 22extern mempool_t *xfs_ioend_pool;
23 23
24typedef void (*xfs_ioend_func_t)(void *);
25
26/* 24/*
27 * xfs_ioend struct manages large extent writes for XFS. 25 * xfs_ioend struct manages large extent writes for XFS.
28 * It can manage several multi-page bio's at once. 26 * It can manage several multi-page bio's at once.
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index cb329edc925b..d71dc44e21ed 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -166,75 +166,6 @@ test_page_region(
166} 166}
167 167
168/* 168/*
169 * Mapping of multi-page buffers into contiguous virtual space
170 */
171
172typedef struct a_list {
173 void *vm_addr;
174 struct a_list *next;
175} a_list_t;
176
177static a_list_t *as_free_head;
178static int as_list_len;
179static DEFINE_SPINLOCK(as_lock);
180
181/*
182 * Try to batch vunmaps because they are costly.
183 */
184STATIC void
185free_address(
186 void *addr)
187{
188 a_list_t *aentry;
189
190#ifdef CONFIG_XEN
191 /*
192 * Xen needs to be able to make sure it can get an exclusive
193 * RO mapping of pages it wants to turn into a pagetable. If
194 * a newly allocated page is also still being vmap()ed by xfs,
195 * it will cause pagetable construction to fail. This is a
196 * quick workaround to always eagerly unmap pages so that Xen
197 * is happy.
198 */
199 vunmap(addr);
200 return;
201#endif
202
203 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
204 if (likely(aentry)) {
205 spin_lock(&as_lock);
206 aentry->next = as_free_head;
207 aentry->vm_addr = addr;
208 as_free_head = aentry;
209 as_list_len++;
210 spin_unlock(&as_lock);
211 } else {
212 vunmap(addr);
213 }
214}
215
216STATIC void
217purge_addresses(void)
218{
219 a_list_t *aentry, *old;
220
221 if (as_free_head == NULL)
222 return;
223
224 spin_lock(&as_lock);
225 aentry = as_free_head;
226 as_free_head = NULL;
227 as_list_len = 0;
228 spin_unlock(&as_lock);
229
230 while ((old = aentry) != NULL) {
231 vunmap(aentry->vm_addr);
232 aentry = aentry->next;
233 kfree(old);
234 }
235}
236
237/*
238 * Internal xfs_buf_t object manipulation 169 * Internal xfs_buf_t object manipulation
239 */ 170 */
240 171
@@ -333,7 +264,7 @@ xfs_buf_free(
333 uint i; 264 uint i;
334 265
335 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 266 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
336 free_address(bp->b_addr - bp->b_offset); 267 vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
337 268
338 for (i = 0; i < bp->b_page_count; i++) { 269 for (i = 0; i < bp->b_page_count; i++) {
339 struct page *page = bp->b_pages[i]; 270 struct page *page = bp->b_pages[i];
@@ -455,10 +386,8 @@ _xfs_buf_map_pages(
455 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 386 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
456 bp->b_flags |= XBF_MAPPED; 387 bp->b_flags |= XBF_MAPPED;
457 } else if (flags & XBF_MAPPED) { 388 } else if (flags & XBF_MAPPED) {
458 if (as_list_len > 64) 389 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
459 purge_addresses(); 390 -1, PAGE_KERNEL);
460 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
461 VM_MAP, PAGE_KERNEL);
462 if (unlikely(bp->b_addr == NULL)) 391 if (unlikely(bp->b_addr == NULL))
463 return -ENOMEM; 392 return -ENOMEM;
464 bp->b_addr += bp->b_offset; 393 bp->b_addr += bp->b_offset;
@@ -1743,8 +1672,6 @@ xfsbufd(
1743 count++; 1672 count++;
1744 } 1673 }
1745 1674
1746 if (as_list_len > 0)
1747 purge_addresses();
1748 if (count) 1675 if (count)
1749 blk_run_address_space(target->bt_mapping); 1676 blk_run_address_space(target->bt_mapping);
1750 1677
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 595751f78350..87b8cbd23d4b 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -126,11 +126,26 @@ xfs_nfs_get_inode(
126 if (ino == 0) 126 if (ino == 0)
127 return ERR_PTR(-ESTALE); 127 return ERR_PTR(-ESTALE);
128 128
129 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); 129 /*
130 if (error) 130 * The XFS_IGET_BULKSTAT means that an invalid inode number is just
131 * fine and not an indication of a corrupted filesystem. Because
132 * clients can send any kind of invalid file handle, e.g. after
133 * a restore on the server we have to deal with this case gracefully.
134 */
135 error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT,
136 XFS_ILOCK_SHARED, &ip, 0);
137 if (error) {
138 /*
139 * EINVAL means the inode cluster doesn't exist anymore.
140 * This implies the filehandle is stale, so we should
141 * translate it here.
142 * We don't use ESTALE directly down the chain to not
143 * confuse applications using bulkstat that expect EINVAL.
144 */
145 if (error == EINVAL)
146 error = ESTALE;
131 return ERR_PTR(-error); 147 return ERR_PTR(-error);
132 if (!ip) 148 }
133 return ERR_PTR(-EIO);
134 149
135 if (ip->i_d.di_gen != generation) { 150 if (ip->i_d.di_gen != generation) {
136 xfs_iput_new(ip, XFS_ILOCK_SHARED); 151 xfs_iput_new(ip, XFS_ILOCK_SHARED);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 67205f6198ba..4bd112313f33 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -50,12 +50,14 @@
50#include "xfs_vnodeops.h" 50#include "xfs_vnodeops.h"
51#include "xfs_quota.h" 51#include "xfs_quota.h"
52#include "xfs_inode_item.h" 52#include "xfs_inode_item.h"
53#include "xfs_export.h"
53 54
54#include <linux/capability.h> 55#include <linux/capability.h>
55#include <linux/dcache.h> 56#include <linux/dcache.h>
56#include <linux/mount.h> 57#include <linux/mount.h>
57#include <linux/namei.h> 58#include <linux/namei.h>
58#include <linux/pagemap.h> 59#include <linux/pagemap.h>
60#include <linux/exportfs.h>
59 61
60/* 62/*
61 * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to 63 * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -164,97 +166,69 @@ xfs_find_handle(
164 return 0; 166 return 0;
165} 167}
166 168
167
168/* 169/*
169 * Convert userspace handle data into inode. 170 * No need to do permission checks on the various pathname components
170 * 171 * as the handle operations are privileged.
171 * We use the fact that all the fsop_handlereq ioctl calls have a data
172 * structure argument whose first component is always a xfs_fsop_handlereq_t,
173 * so we can pass that sub structure into this handy, shared routine.
174 *
175 * If no error, caller must always iput the returned inode.
176 */ 172 */
177STATIC int 173STATIC int
178xfs_vget_fsop_handlereq( 174xfs_handle_acceptable(
179 xfs_mount_t *mp, 175 void *context,
180 struct inode *parinode, /* parent inode pointer */ 176 struct dentry *dentry)
181 xfs_fsop_handlereq_t *hreq, 177{
182 struct inode **inode) 178 return 1;
179}
180
181/*
182 * Convert userspace handle data into a dentry.
183 */
184struct dentry *
185xfs_handle_to_dentry(
186 struct file *parfilp,
187 void __user *uhandle,
188 u32 hlen)
183{ 189{
184 void __user *hanp;
185 size_t hlen;
186 xfs_fid_t *xfid;
187 xfs_handle_t *handlep;
188 xfs_handle_t handle; 190 xfs_handle_t handle;
189 xfs_inode_t *ip; 191 struct xfs_fid64 fid;
190 xfs_ino_t ino;
191 __u32 igen;
192 int error;
193 192
194 /* 193 /*
195 * Only allow handle opens under a directory. 194 * Only allow handle opens under a directory.
196 */ 195 */
197 if (!S_ISDIR(parinode->i_mode)) 196 if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
198 return XFS_ERROR(ENOTDIR); 197 return ERR_PTR(-ENOTDIR);
199 198
200 hanp = hreq->ihandle; 199 if (hlen != sizeof(xfs_handle_t))
201 hlen = hreq->ihandlen; 200 return ERR_PTR(-EINVAL);
202 handlep = &handle; 201 if (copy_from_user(&handle, uhandle, hlen))
203 202 return ERR_PTR(-EFAULT);
204 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) 203 if (handle.ha_fid.fid_len !=
205 return XFS_ERROR(EINVAL); 204 sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
206 if (copy_from_user(handlep, hanp, hlen)) 205 return ERR_PTR(-EINVAL);
207 return XFS_ERROR(EFAULT); 206
208 if (hlen < sizeof(*handlep)) 207 memset(&fid, 0, sizeof(struct fid));
209 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); 208 fid.ino = handle.ha_fid.fid_ino;
210 if (hlen > sizeof(handlep->ha_fsid)) { 209 fid.gen = handle.ha_fid.fid_gen;
211 if (handlep->ha_fid.fid_len != 210
212 (hlen - sizeof(handlep->ha_fsid) - 211 return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
213 sizeof(handlep->ha_fid.fid_len)) || 212 FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
214 handlep->ha_fid.fid_pad) 213 xfs_handle_acceptable, NULL);
215 return XFS_ERROR(EINVAL); 214}
216 }
217
218 /*
219 * Crack the handle, obtain the inode # & generation #
220 */
221 xfid = (struct xfs_fid *)&handlep->ha_fid;
222 if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
223 ino = xfid->fid_ino;
224 igen = xfid->fid_gen;
225 } else {
226 return XFS_ERROR(EINVAL);
227 }
228
229 /*
230 * Get the XFS inode, building a Linux inode to go with it.
231 */
232 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
233 if (error)
234 return error;
235 if (ip == NULL)
236 return XFS_ERROR(EIO);
237 if (ip->i_d.di_gen != igen) {
238 xfs_iput_new(ip, XFS_ILOCK_SHARED);
239 return XFS_ERROR(ENOENT);
240 }
241
242 xfs_iunlock(ip, XFS_ILOCK_SHARED);
243 215
244 *inode = VFS_I(ip); 216STATIC struct dentry *
245 return 0; 217xfs_handlereq_to_dentry(
218 struct file *parfilp,
219 xfs_fsop_handlereq_t *hreq)
220{
221 return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
246} 222}
247 223
248int 224int
249xfs_open_by_handle( 225xfs_open_by_handle(
250 xfs_mount_t *mp,
251 xfs_fsop_handlereq_t *hreq,
252 struct file *parfilp, 226 struct file *parfilp,
253 struct inode *parinode) 227 xfs_fsop_handlereq_t *hreq)
254{ 228{
255 const struct cred *cred = current_cred(); 229 const struct cred *cred = current_cred();
256 int error; 230 int error;
257 int new_fd; 231 int fd;
258 int permflag; 232 int permflag;
259 struct file *filp; 233 struct file *filp;
260 struct inode *inode; 234 struct inode *inode;
@@ -263,19 +237,21 @@ xfs_open_by_handle(
263 if (!capable(CAP_SYS_ADMIN)) 237 if (!capable(CAP_SYS_ADMIN))
264 return -XFS_ERROR(EPERM); 238 return -XFS_ERROR(EPERM);
265 239
266 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); 240 dentry = xfs_handlereq_to_dentry(parfilp, hreq);
267 if (error) 241 if (IS_ERR(dentry))
268 return -error; 242 return PTR_ERR(dentry);
243 inode = dentry->d_inode;
269 244
270 /* Restrict xfs_open_by_handle to directories & regular files. */ 245 /* Restrict xfs_open_by_handle to directories & regular files. */
271 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { 246 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
272 iput(inode); 247 error = -XFS_ERROR(EPERM);
273 return -XFS_ERROR(EINVAL); 248 goto out_dput;
274 } 249 }
275 250
276#if BITS_PER_LONG != 32 251#if BITS_PER_LONG != 32
277 hreq->oflags |= O_LARGEFILE; 252 hreq->oflags |= O_LARGEFILE;
278#endif 253#endif
254
279 /* Put open permission in namei format. */ 255 /* Put open permission in namei format. */
280 permflag = hreq->oflags; 256 permflag = hreq->oflags;
281 if ((permflag+1) & O_ACCMODE) 257 if ((permflag+1) & O_ACCMODE)
@@ -285,50 +261,45 @@ xfs_open_by_handle(
285 261
286 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && 262 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
287 (permflag & FMODE_WRITE) && IS_APPEND(inode)) { 263 (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
288 iput(inode); 264 error = -XFS_ERROR(EPERM);
289 return -XFS_ERROR(EPERM); 265 goto out_dput;
290 } 266 }
291 267
292 if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { 268 if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
293 iput(inode); 269 error = -XFS_ERROR(EACCES);
294 return -XFS_ERROR(EACCES); 270 goto out_dput;
295 } 271 }
296 272
297 /* Can't write directories. */ 273 /* Can't write directories. */
298 if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { 274 if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
299 iput(inode); 275 error = -XFS_ERROR(EISDIR);
300 return -XFS_ERROR(EISDIR); 276 goto out_dput;
301 }
302
303 if ((new_fd = get_unused_fd()) < 0) {
304 iput(inode);
305 return new_fd;
306 } 277 }
307 278
308 dentry = d_obtain_alias(inode); 279 fd = get_unused_fd();
309 if (IS_ERR(dentry)) { 280 if (fd < 0) {
310 put_unused_fd(new_fd); 281 error = fd;
311 return PTR_ERR(dentry); 282 goto out_dput;
312 } 283 }
313 284
314 /* Ensure umount returns EBUSY on umounts while this file is open. */ 285 filp = dentry_open(dentry, mntget(parfilp->f_path.mnt),
315 mntget(parfilp->f_path.mnt); 286 hreq->oflags, cred);
316
317 /* Create file pointer. */
318 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
319 if (IS_ERR(filp)) { 287 if (IS_ERR(filp)) {
320 put_unused_fd(new_fd); 288 put_unused_fd(fd);
321 return -XFS_ERROR(-PTR_ERR(filp)); 289 return PTR_ERR(filp);
322 } 290 }
323 291
324 if (inode->i_mode & S_IFREG) { 292 if (inode->i_mode & S_IFREG) {
325 /* invisible operation should not change atime */
326 filp->f_flags |= O_NOATIME; 293 filp->f_flags |= O_NOATIME;
327 filp->f_mode |= FMODE_NOCMTIME; 294 filp->f_mode |= FMODE_NOCMTIME;
328 } 295 }
329 296
330 fd_install(new_fd, filp); 297 fd_install(fd, filp);
331 return new_fd; 298 return fd;
299
300 out_dput:
301 dput(dentry);
302 return error;
332} 303}
333 304
334/* 305/*
@@ -359,11 +330,10 @@ do_readlink(
359 330
360int 331int
361xfs_readlink_by_handle( 332xfs_readlink_by_handle(
362 xfs_mount_t *mp, 333 struct file *parfilp,
363 xfs_fsop_handlereq_t *hreq, 334 xfs_fsop_handlereq_t *hreq)
364 struct inode *parinode)
365{ 335{
366 struct inode *inode; 336 struct dentry *dentry;
367 __u32 olen; 337 __u32 olen;
368 void *link; 338 void *link;
369 int error; 339 int error;
@@ -371,26 +341,28 @@ xfs_readlink_by_handle(
371 if (!capable(CAP_SYS_ADMIN)) 341 if (!capable(CAP_SYS_ADMIN))
372 return -XFS_ERROR(EPERM); 342 return -XFS_ERROR(EPERM);
373 343
374 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); 344 dentry = xfs_handlereq_to_dentry(parfilp, hreq);
375 if (error) 345 if (IS_ERR(dentry))
376 return -error; 346 return PTR_ERR(dentry);
377 347
378 /* Restrict this handle operation to symlinks only. */ 348 /* Restrict this handle operation to symlinks only. */
379 if (!S_ISLNK(inode->i_mode)) { 349 if (!S_ISLNK(dentry->d_inode->i_mode)) {
380 error = -XFS_ERROR(EINVAL); 350 error = -XFS_ERROR(EINVAL);
381 goto out_iput; 351 goto out_dput;
382 } 352 }
383 353
384 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { 354 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
385 error = -XFS_ERROR(EFAULT); 355 error = -XFS_ERROR(EFAULT);
386 goto out_iput; 356 goto out_dput;
387 } 357 }
388 358
389 link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); 359 link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
390 if (!link) 360 if (!link) {
391 goto out_iput; 361 error = -XFS_ERROR(ENOMEM);
362 goto out_dput;
363 }
392 364
393 error = -xfs_readlink(XFS_I(inode), link); 365 error = -xfs_readlink(XFS_I(dentry->d_inode), link);
394 if (error) 366 if (error)
395 goto out_kfree; 367 goto out_kfree;
396 error = do_readlink(hreq->ohandle, olen, link); 368 error = do_readlink(hreq->ohandle, olen, link);
@@ -399,32 +371,31 @@ xfs_readlink_by_handle(
399 371
400 out_kfree: 372 out_kfree:
401 kfree(link); 373 kfree(link);
402 out_iput: 374 out_dput:
403 iput(inode); 375 dput(dentry);
404 return error; 376 return error;
405} 377}
406 378
407STATIC int 379STATIC int
408xfs_fssetdm_by_handle( 380xfs_fssetdm_by_handle(
409 xfs_mount_t *mp, 381 struct file *parfilp,
410 void __user *arg, 382 void __user *arg)
411 struct inode *parinode)
412{ 383{
413 int error; 384 int error;
414 struct fsdmidata fsd; 385 struct fsdmidata fsd;
415 xfs_fsop_setdm_handlereq_t dmhreq; 386 xfs_fsop_setdm_handlereq_t dmhreq;
416 struct inode *inode; 387 struct dentry *dentry;
417 388
418 if (!capable(CAP_MKNOD)) 389 if (!capable(CAP_MKNOD))
419 return -XFS_ERROR(EPERM); 390 return -XFS_ERROR(EPERM);
420 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) 391 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
421 return -XFS_ERROR(EFAULT); 392 return -XFS_ERROR(EFAULT);
422 393
423 error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &inode); 394 dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
424 if (error) 395 if (IS_ERR(dentry))
425 return -error; 396 return PTR_ERR(dentry);
426 397
427 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { 398 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
428 error = -XFS_ERROR(EPERM); 399 error = -XFS_ERROR(EPERM);
429 goto out; 400 goto out;
430 } 401 }
@@ -434,24 +405,23 @@ xfs_fssetdm_by_handle(
434 goto out; 405 goto out;
435 } 406 }
436 407
437 error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, 408 error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
438 fsd.fsd_dmstate); 409 fsd.fsd_dmstate);
439 410
440 out: 411 out:
441 iput(inode); 412 dput(dentry);
442 return error; 413 return error;
443} 414}
444 415
445STATIC int 416STATIC int
446xfs_attrlist_by_handle( 417xfs_attrlist_by_handle(
447 xfs_mount_t *mp, 418 struct file *parfilp,
448 void __user *arg, 419 void __user *arg)
449 struct inode *parinode)
450{ 420{
451 int error; 421 int error = -ENOMEM;
452 attrlist_cursor_kern_t *cursor; 422 attrlist_cursor_kern_t *cursor;
453 xfs_fsop_attrlist_handlereq_t al_hreq; 423 xfs_fsop_attrlist_handlereq_t al_hreq;
454 struct inode *inode; 424 struct dentry *dentry;
455 char *kbuf; 425 char *kbuf;
456 426
457 if (!capable(CAP_SYS_ADMIN)) 427 if (!capable(CAP_SYS_ADMIN))
@@ -467,16 +437,16 @@ xfs_attrlist_by_handle(
467 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) 437 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
468 return -XFS_ERROR(EINVAL); 438 return -XFS_ERROR(EINVAL);
469 439
470 error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode); 440 dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
471 if (error) 441 if (IS_ERR(dentry))
472 goto out; 442 return PTR_ERR(dentry);
473 443
474 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 444 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
475 if (!kbuf) 445 if (!kbuf)
476 goto out_vn_rele; 446 goto out_dput;
477 447
478 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; 448 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
479 error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, 449 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
480 al_hreq.flags, cursor); 450 al_hreq.flags, cursor);
481 if (error) 451 if (error)
482 goto out_kfree; 452 goto out_kfree;
@@ -486,10 +456,9 @@ xfs_attrlist_by_handle(
486 456
487 out_kfree: 457 out_kfree:
488 kfree(kbuf); 458 kfree(kbuf);
489 out_vn_rele: 459 out_dput:
490 iput(inode); 460 dput(dentry);
491 out: 461 return error;
492 return -error;
493} 462}
494 463
495int 464int
@@ -564,15 +533,13 @@ xfs_attrmulti_attr_remove(
564 533
565STATIC int 534STATIC int
566xfs_attrmulti_by_handle( 535xfs_attrmulti_by_handle(
567 xfs_mount_t *mp,
568 void __user *arg,
569 struct file *parfilp, 536 struct file *parfilp,
570 struct inode *parinode) 537 void __user *arg)
571{ 538{
572 int error; 539 int error;
573 xfs_attr_multiop_t *ops; 540 xfs_attr_multiop_t *ops;
574 xfs_fsop_attrmulti_handlereq_t am_hreq; 541 xfs_fsop_attrmulti_handlereq_t am_hreq;
575 struct inode *inode; 542 struct dentry *dentry;
576 unsigned int i, size; 543 unsigned int i, size;
577 char *attr_name; 544 char *attr_name;
578 545
@@ -581,19 +548,19 @@ xfs_attrmulti_by_handle(
581 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 548 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
582 return -XFS_ERROR(EFAULT); 549 return -XFS_ERROR(EFAULT);
583 550
584 error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &inode); 551 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
585 if (error) 552 if (IS_ERR(dentry))
586 goto out; 553 return PTR_ERR(dentry);
587 554
588 error = E2BIG; 555 error = E2BIG;
589 size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); 556 size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
590 if (!size || size > 16 * PAGE_SIZE) 557 if (!size || size > 16 * PAGE_SIZE)
591 goto out_vn_rele; 558 goto out_dput;
592 559
593 error = ENOMEM; 560 error = ENOMEM;
594 ops = kmalloc(size, GFP_KERNEL); 561 ops = kmalloc(size, GFP_KERNEL);
595 if (!ops) 562 if (!ops)
596 goto out_vn_rele; 563 goto out_dput;
597 564
598 error = EFAULT; 565 error = EFAULT;
599 if (copy_from_user(ops, am_hreq.ops, size)) 566 if (copy_from_user(ops, am_hreq.ops, size))
@@ -615,25 +582,28 @@ xfs_attrmulti_by_handle(
615 582
616 switch (ops[i].am_opcode) { 583 switch (ops[i].am_opcode) {
617 case ATTR_OP_GET: 584 case ATTR_OP_GET:
618 ops[i].am_error = xfs_attrmulti_attr_get(inode, 585 ops[i].am_error = xfs_attrmulti_attr_get(
619 attr_name, ops[i].am_attrvalue, 586 dentry->d_inode, attr_name,
620 &ops[i].am_length, ops[i].am_flags); 587 ops[i].am_attrvalue, &ops[i].am_length,
588 ops[i].am_flags);
621 break; 589 break;
622 case ATTR_OP_SET: 590 case ATTR_OP_SET:
623 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); 591 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
624 if (ops[i].am_error) 592 if (ops[i].am_error)
625 break; 593 break;
626 ops[i].am_error = xfs_attrmulti_attr_set(inode, 594 ops[i].am_error = xfs_attrmulti_attr_set(
627 attr_name, ops[i].am_attrvalue, 595 dentry->d_inode, attr_name,
628 ops[i].am_length, ops[i].am_flags); 596 ops[i].am_attrvalue, ops[i].am_length,
597 ops[i].am_flags);
629 mnt_drop_write(parfilp->f_path.mnt); 598 mnt_drop_write(parfilp->f_path.mnt);
630 break; 599 break;
631 case ATTR_OP_REMOVE: 600 case ATTR_OP_REMOVE:
632 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); 601 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
633 if (ops[i].am_error) 602 if (ops[i].am_error)
634 break; 603 break;
635 ops[i].am_error = xfs_attrmulti_attr_remove(inode, 604 ops[i].am_error = xfs_attrmulti_attr_remove(
636 attr_name, ops[i].am_flags); 605 dentry->d_inode, attr_name,
606 ops[i].am_flags);
637 mnt_drop_write(parfilp->f_path.mnt); 607 mnt_drop_write(parfilp->f_path.mnt);
638 break; 608 break;
639 default: 609 default:
@@ -647,9 +617,8 @@ xfs_attrmulti_by_handle(
647 kfree(attr_name); 617 kfree(attr_name);
648 out_kfree_ops: 618 out_kfree_ops:
649 kfree(ops); 619 kfree(ops);
650 out_vn_rele: 620 out_dput:
651 iput(inode); 621 dput(dentry);
652 out:
653 return -error; 622 return -error;
654} 623}
655 624
@@ -1440,23 +1409,23 @@ xfs_file_ioctl(
1440 1409
1441 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) 1410 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1442 return -XFS_ERROR(EFAULT); 1411 return -XFS_ERROR(EFAULT);
1443 return xfs_open_by_handle(mp, &hreq, filp, inode); 1412 return xfs_open_by_handle(filp, &hreq);
1444 } 1413 }
1445 case XFS_IOC_FSSETDM_BY_HANDLE: 1414 case XFS_IOC_FSSETDM_BY_HANDLE:
1446 return xfs_fssetdm_by_handle(mp, arg, inode); 1415 return xfs_fssetdm_by_handle(filp, arg);
1447 1416
1448 case XFS_IOC_READLINK_BY_HANDLE: { 1417 case XFS_IOC_READLINK_BY_HANDLE: {
1449 xfs_fsop_handlereq_t hreq; 1418 xfs_fsop_handlereq_t hreq;
1450 1419
1451 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) 1420 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1452 return -XFS_ERROR(EFAULT); 1421 return -XFS_ERROR(EFAULT);
1453 return xfs_readlink_by_handle(mp, &hreq, inode); 1422 return xfs_readlink_by_handle(filp, &hreq);
1454 } 1423 }
1455 case XFS_IOC_ATTRLIST_BY_HANDLE: 1424 case XFS_IOC_ATTRLIST_BY_HANDLE:
1456 return xfs_attrlist_by_handle(mp, arg, inode); 1425 return xfs_attrlist_by_handle(filp, arg);
1457 1426
1458 case XFS_IOC_ATTRMULTI_BY_HANDLE: 1427 case XFS_IOC_ATTRMULTI_BY_HANDLE:
1459 return xfs_attrmulti_by_handle(mp, arg, filp, inode); 1428 return xfs_attrmulti_by_handle(filp, arg);
1460 1429
1461 case XFS_IOC_SWAPEXT: { 1430 case XFS_IOC_SWAPEXT: {
1462 struct xfs_swapext sxp; 1431 struct xfs_swapext sxp;
@@ -1546,21 +1515,6 @@ xfs_file_ioctl(
1546 return -error; 1515 return -error;
1547 } 1516 }
1548 1517
1549 case XFS_IOC_FREEZE:
1550 if (!capable(CAP_SYS_ADMIN))
1551 return -EPERM;
1552
1553 if (inode->i_sb->s_frozen == SB_UNFROZEN)
1554 freeze_bdev(inode->i_sb->s_bdev);
1555 return 0;
1556
1557 case XFS_IOC_THAW:
1558 if (!capable(CAP_SYS_ADMIN))
1559 return -EPERM;
1560 if (inode->i_sb->s_frozen != SB_UNFROZEN)
1561 thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
1562 return 0;
1563
1564 case XFS_IOC_GOINGDOWN: { 1518 case XFS_IOC_GOINGDOWN: {
1565 __uint32_t in; 1519 __uint32_t in;
1566 1520
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 8c16bf2d7e03..7bd7c6afc1eb 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -34,16 +34,13 @@ xfs_find_handle(
34 34
35extern int 35extern int
36xfs_open_by_handle( 36xfs_open_by_handle(
37 xfs_mount_t *mp,
38 xfs_fsop_handlereq_t *hreq,
39 struct file *parfilp, 37 struct file *parfilp,
40 struct inode *parinode); 38 xfs_fsop_handlereq_t *hreq);
41 39
42extern int 40extern int
43xfs_readlink_by_handle( 41xfs_readlink_by_handle(
44 xfs_mount_t *mp, 42 struct file *parfilp,
45 xfs_fsop_handlereq_t *hreq, 43 xfs_fsop_handlereq_t *hreq);
46 struct inode *parinode);
47 44
48extern int 45extern int
49xfs_attrmulti_attr_get( 46xfs_attrmulti_attr_get(
@@ -67,6 +64,12 @@ xfs_attrmulti_attr_remove(
67 char *name, 64 char *name,
68 __uint32_t flags); 65 __uint32_t flags);
69 66
67extern struct dentry *
68xfs_handle_to_dentry(
69 struct file *parfilp,
70 void __user *uhandle,
71 u32 hlen);
72
70extern long 73extern long
71xfs_file_ioctl( 74xfs_file_ioctl(
72 struct file *filp, 75 struct file *filp,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0504cece9f66..c70c4e3db790 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/ioctl.h> 19#include <linux/ioctl.h>
20#include <linux/mount.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "xfs.h" 22#include "xfs.h"
22#include "xfs_fs.h" 23#include "xfs_fs.h"
@@ -340,96 +341,24 @@ xfs_compat_handlereq_copyin(
340 return 0; 341 return 0;
341} 342}
342 343
343/* 344STATIC struct dentry *
344 * Convert userspace handle data into inode. 345xfs_compat_handlereq_to_dentry(
345 * 346 struct file *parfilp,
346 * We use the fact that all the fsop_handlereq ioctl calls have a data 347 compat_xfs_fsop_handlereq_t *hreq)
347 * structure argument whose first component is always a xfs_fsop_handlereq_t,
348 * so we can pass that sub structure into this handy, shared routine.
349 *
350 * If no error, caller must always iput the returned inode.
351 */
352STATIC int
353xfs_vget_fsop_handlereq_compat(
354 xfs_mount_t *mp,
355 struct inode *parinode, /* parent inode pointer */
356 compat_xfs_fsop_handlereq_t *hreq,
357 struct inode **inode)
358{ 348{
359 void __user *hanp; 349 return xfs_handle_to_dentry(parfilp,
360 size_t hlen; 350 compat_ptr(hreq->ihandle), hreq->ihandlen);
361 xfs_fid_t *xfid;
362 xfs_handle_t *handlep;
363 xfs_handle_t handle;
364 xfs_inode_t *ip;
365 xfs_ino_t ino;
366 __u32 igen;
367 int error;
368
369 /*
370 * Only allow handle opens under a directory.
371 */
372 if (!S_ISDIR(parinode->i_mode))
373 return XFS_ERROR(ENOTDIR);
374
375 hanp = compat_ptr(hreq->ihandle);
376 hlen = hreq->ihandlen;
377 handlep = &handle;
378
379 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
380 return XFS_ERROR(EINVAL);
381 if (copy_from_user(handlep, hanp, hlen))
382 return XFS_ERROR(EFAULT);
383 if (hlen < sizeof(*handlep))
384 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
385 if (hlen > sizeof(handlep->ha_fsid)) {
386 if (handlep->ha_fid.fid_len !=
387 (hlen - sizeof(handlep->ha_fsid) -
388 sizeof(handlep->ha_fid.fid_len)) ||
389 handlep->ha_fid.fid_pad)
390 return XFS_ERROR(EINVAL);
391 }
392
393 /*
394 * Crack the handle, obtain the inode # & generation #
395 */
396 xfid = (struct xfs_fid *)&handlep->ha_fid;
397 if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
398 ino = xfid->fid_ino;
399 igen = xfid->fid_gen;
400 } else {
401 return XFS_ERROR(EINVAL);
402 }
403
404 /*
405 * Get the XFS inode, building a Linux inode to go with it.
406 */
407 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
408 if (error)
409 return error;
410 if (ip == NULL)
411 return XFS_ERROR(EIO);
412 if (ip->i_d.di_gen != igen) {
413 xfs_iput_new(ip, XFS_ILOCK_SHARED);
414 return XFS_ERROR(ENOENT);
415 }
416
417 xfs_iunlock(ip, XFS_ILOCK_SHARED);
418
419 *inode = VFS_I(ip);
420 return 0;
421} 351}
422 352
423STATIC int 353STATIC int
424xfs_compat_attrlist_by_handle( 354xfs_compat_attrlist_by_handle(
425 xfs_mount_t *mp, 355 struct file *parfilp,
426 void __user *arg, 356 void __user *arg)
427 struct inode *parinode)
428{ 357{
429 int error; 358 int error;
430 attrlist_cursor_kern_t *cursor; 359 attrlist_cursor_kern_t *cursor;
431 compat_xfs_fsop_attrlist_handlereq_t al_hreq; 360 compat_xfs_fsop_attrlist_handlereq_t al_hreq;
432 struct inode *inode; 361 struct dentry *dentry;
433 char *kbuf; 362 char *kbuf;
434 363
435 if (!capable(CAP_SYS_ADMIN)) 364 if (!capable(CAP_SYS_ADMIN))
@@ -446,17 +375,17 @@ xfs_compat_attrlist_by_handle(
446 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) 375 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
447 return -XFS_ERROR(EINVAL); 376 return -XFS_ERROR(EINVAL);
448 377
449 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq, 378 dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
450 &inode); 379 if (IS_ERR(dentry))
451 if (error) 380 return PTR_ERR(dentry);
452 goto out;
453 381
382 error = -ENOMEM;
454 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); 383 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
455 if (!kbuf) 384 if (!kbuf)
456 goto out_vn_rele; 385 goto out_dput;
457 386
458 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; 387 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
459 error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, 388 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
460 al_hreq.flags, cursor); 389 al_hreq.flags, cursor);
461 if (error) 390 if (error)
462 goto out_kfree; 391 goto out_kfree;
@@ -466,22 +395,20 @@ xfs_compat_attrlist_by_handle(
466 395
467 out_kfree: 396 out_kfree:
468 kfree(kbuf); 397 kfree(kbuf);
469 out_vn_rele: 398 out_dput:
470 iput(inode); 399 dput(dentry);
471 out: 400 return error;
472 return -error;
473} 401}
474 402
475STATIC int 403STATIC int
476xfs_compat_attrmulti_by_handle( 404xfs_compat_attrmulti_by_handle(
477 xfs_mount_t *mp, 405 struct file *parfilp,
478 void __user *arg, 406 void __user *arg)
479 struct inode *parinode)
480{ 407{
481 int error; 408 int error;
482 compat_xfs_attr_multiop_t *ops; 409 compat_xfs_attr_multiop_t *ops;
483 compat_xfs_fsop_attrmulti_handlereq_t am_hreq; 410 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
484 struct inode *inode; 411 struct dentry *dentry;
485 unsigned int i, size; 412 unsigned int i, size;
486 char *attr_name; 413 char *attr_name;
487 414
@@ -491,20 +418,19 @@ xfs_compat_attrmulti_by_handle(
491 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 418 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
492 return -XFS_ERROR(EFAULT); 419 return -XFS_ERROR(EFAULT);
493 420
494 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq, 421 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
495 &inode); 422 if (IS_ERR(dentry))
496 if (error) 423 return PTR_ERR(dentry);
497 goto out;
498 424
499 error = E2BIG; 425 error = E2BIG;
500 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); 426 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
501 if (!size || size > 16 * PAGE_SIZE) 427 if (!size || size > 16 * PAGE_SIZE)
502 goto out_vn_rele; 428 goto out_dput;
503 429
504 error = ENOMEM; 430 error = ENOMEM;
505 ops = kmalloc(size, GFP_KERNEL); 431 ops = kmalloc(size, GFP_KERNEL);
506 if (!ops) 432 if (!ops)
507 goto out_vn_rele; 433 goto out_dput;
508 434
509 error = EFAULT; 435 error = EFAULT;
510 if (copy_from_user(ops, compat_ptr(am_hreq.ops), size)) 436 if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
@@ -527,20 +453,29 @@ xfs_compat_attrmulti_by_handle(
527 453
528 switch (ops[i].am_opcode) { 454 switch (ops[i].am_opcode) {
529 case ATTR_OP_GET: 455 case ATTR_OP_GET:
530 ops[i].am_error = xfs_attrmulti_attr_get(inode, 456 ops[i].am_error = xfs_attrmulti_attr_get(
531 attr_name, 457 dentry->d_inode, attr_name,
532 compat_ptr(ops[i].am_attrvalue), 458 compat_ptr(ops[i].am_attrvalue),
533 &ops[i].am_length, ops[i].am_flags); 459 &ops[i].am_length, ops[i].am_flags);
534 break; 460 break;
535 case ATTR_OP_SET: 461 case ATTR_OP_SET:
536 ops[i].am_error = xfs_attrmulti_attr_set(inode, 462 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
537 attr_name, 463 if (ops[i].am_error)
464 break;
465 ops[i].am_error = xfs_attrmulti_attr_set(
466 dentry->d_inode, attr_name,
538 compat_ptr(ops[i].am_attrvalue), 467 compat_ptr(ops[i].am_attrvalue),
539 ops[i].am_length, ops[i].am_flags); 468 ops[i].am_length, ops[i].am_flags);
469 mnt_drop_write(parfilp->f_path.mnt);
540 break; 470 break;
541 case ATTR_OP_REMOVE: 471 case ATTR_OP_REMOVE:
542 ops[i].am_error = xfs_attrmulti_attr_remove(inode, 472 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
543 attr_name, ops[i].am_flags); 473 if (ops[i].am_error)
474 break;
475 ops[i].am_error = xfs_attrmulti_attr_remove(
476 dentry->d_inode, attr_name,
477 ops[i].am_flags);
478 mnt_drop_write(parfilp->f_path.mnt);
544 break; 479 break;
545 default: 480 default:
546 ops[i].am_error = EINVAL; 481 ops[i].am_error = EINVAL;
@@ -553,22 +488,20 @@ xfs_compat_attrmulti_by_handle(
553 kfree(attr_name); 488 kfree(attr_name);
554 out_kfree_ops: 489 out_kfree_ops:
555 kfree(ops); 490 kfree(ops);
556 out_vn_rele: 491 out_dput:
557 iput(inode); 492 dput(dentry);
558 out:
559 return -error; 493 return -error;
560} 494}
561 495
562STATIC int 496STATIC int
563xfs_compat_fssetdm_by_handle( 497xfs_compat_fssetdm_by_handle(
564 xfs_mount_t *mp, 498 struct file *parfilp,
565 void __user *arg, 499 void __user *arg)
566 struct inode *parinode)
567{ 500{
568 int error; 501 int error;
569 struct fsdmidata fsd; 502 struct fsdmidata fsd;
570 compat_xfs_fsop_setdm_handlereq_t dmhreq; 503 compat_xfs_fsop_setdm_handlereq_t dmhreq;
571 struct inode *inode; 504 struct dentry *dentry;
572 505
573 if (!capable(CAP_MKNOD)) 506 if (!capable(CAP_MKNOD))
574 return -XFS_ERROR(EPERM); 507 return -XFS_ERROR(EPERM);
@@ -576,12 +509,11 @@ xfs_compat_fssetdm_by_handle(
576 sizeof(compat_xfs_fsop_setdm_handlereq_t))) 509 sizeof(compat_xfs_fsop_setdm_handlereq_t)))
577 return -XFS_ERROR(EFAULT); 510 return -XFS_ERROR(EFAULT);
578 511
579 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq, 512 dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
580 &inode); 513 if (IS_ERR(dentry))
581 if (error) 514 return PTR_ERR(dentry);
582 return -error;
583 515
584 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { 516 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
585 error = -XFS_ERROR(EPERM); 517 error = -XFS_ERROR(EPERM);
586 goto out; 518 goto out;
587 } 519 }
@@ -591,11 +523,11 @@ xfs_compat_fssetdm_by_handle(
591 goto out; 523 goto out;
592 } 524 }
593 525
594 error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, 526 error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
595 fsd.fsd_dmstate); 527 fsd.fsd_dmstate);
596 528
597out: 529out:
598 iput(inode); 530 dput(dentry);
599 return error; 531 return error;
600} 532}
601 533
@@ -632,8 +564,6 @@ xfs_file_compat_ioctl(
632 case XFS_IOC_SET_RESBLKS: 564 case XFS_IOC_SET_RESBLKS:
633 case XFS_IOC_GET_RESBLKS: 565 case XFS_IOC_GET_RESBLKS:
634 case XFS_IOC_FSGROWFSLOG: 566 case XFS_IOC_FSGROWFSLOG:
635 case XFS_IOC_FREEZE:
636 case XFS_IOC_THAW:
637 case XFS_IOC_GOINGDOWN: 567 case XFS_IOC_GOINGDOWN:
638 case XFS_IOC_ERROR_INJECTION: 568 case XFS_IOC_ERROR_INJECTION:
639 case XFS_IOC_ERROR_CLEARALL: 569 case XFS_IOC_ERROR_CLEARALL:
@@ -724,21 +654,21 @@ xfs_file_compat_ioctl(
724 654
725 if (xfs_compat_handlereq_copyin(&hreq, arg)) 655 if (xfs_compat_handlereq_copyin(&hreq, arg))
726 return -XFS_ERROR(EFAULT); 656 return -XFS_ERROR(EFAULT);
727 return xfs_open_by_handle(mp, &hreq, filp, inode); 657 return xfs_open_by_handle(filp, &hreq);
728 } 658 }
729 case XFS_IOC_READLINK_BY_HANDLE_32: { 659 case XFS_IOC_READLINK_BY_HANDLE_32: {
730 struct xfs_fsop_handlereq hreq; 660 struct xfs_fsop_handlereq hreq;
731 661
732 if (xfs_compat_handlereq_copyin(&hreq, arg)) 662 if (xfs_compat_handlereq_copyin(&hreq, arg))
733 return -XFS_ERROR(EFAULT); 663 return -XFS_ERROR(EFAULT);
734 return xfs_readlink_by_handle(mp, &hreq, inode); 664 return xfs_readlink_by_handle(filp, &hreq);
735 } 665 }
736 case XFS_IOC_ATTRLIST_BY_HANDLE_32: 666 case XFS_IOC_ATTRLIST_BY_HANDLE_32:
737 return xfs_compat_attrlist_by_handle(mp, arg, inode); 667 return xfs_compat_attrlist_by_handle(filp, arg);
738 case XFS_IOC_ATTRMULTI_BY_HANDLE_32: 668 case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
739 return xfs_compat_attrmulti_by_handle(mp, arg, inode); 669 return xfs_compat_attrmulti_by_handle(filp, arg);
740 case XFS_IOC_FSSETDM_BY_HANDLE_32: 670 case XFS_IOC_FSSETDM_BY_HANDLE_32:
741 return xfs_compat_fssetdm_by_handle(mp, arg, inode); 671 return xfs_compat_fssetdm_by_handle(filp, arg);
742 default: 672 default:
743 return -XFS_ERROR(ENOIOCTLCMD); 673 return -XFS_ERROR(ENOIOCTLCMD);
744 } 674 }
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 36f6cc703ef2..c71e226da7f5 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
1197 struct xfs_mount *mp = XFS_M(sb); 1197 struct xfs_mount *mp = XFS_M(sb);
1198 substring_t args[MAX_OPT_ARGS]; 1198 substring_t args[MAX_OPT_ARGS];
1199 char *p; 1199 char *p;
1200 int error;
1200 1201
1201 while ((p = strsep(&options, ",")) != NULL) { 1202 while ((p = strsep(&options, ",")) != NULL) {
1202 int token; 1203 int token;
@@ -1247,11 +1248,25 @@ xfs_fs_remount(
1247 } 1248 }
1248 } 1249 }
1249 1250
1250 /* rw/ro -> rw */ 1251 /* ro -> rw */
1251 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { 1252 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
1252 mp->m_flags &= ~XFS_MOUNT_RDONLY; 1253 mp->m_flags &= ~XFS_MOUNT_RDONLY;
1253 if (mp->m_flags & XFS_MOUNT_BARRIER) 1254 if (mp->m_flags & XFS_MOUNT_BARRIER)
1254 xfs_mountfs_check_barriers(mp); 1255 xfs_mountfs_check_barriers(mp);
1256
1257 /*
1258 * If this is the first remount to writeable state we
1259 * might have some superblock changes to update.
1260 */
1261 if (mp->m_update_flags) {
1262 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1263 if (error) {
1264 cmn_err(CE_WARN,
1265 "XFS: failed to write sb changes");
1266 return error;
1267 }
1268 mp->m_update_flags = 0;
1269 }
1255 } 1270 }
1256 1271
1257 /* rw -> ro */ 1272 /* rw -> ro */
@@ -1269,14 +1284,14 @@ xfs_fs_remount(
1269 * need to take care of the metadata. Once that's done write a dummy 1284 * need to take care of the metadata. Once that's done write a dummy
1270 * record to dirty the log in case of a crash while frozen. 1285 * record to dirty the log in case of a crash while frozen.
1271 */ 1286 */
1272STATIC void 1287STATIC int
1273xfs_fs_lockfs( 1288xfs_fs_freeze(
1274 struct super_block *sb) 1289 struct super_block *sb)
1275{ 1290{
1276 struct xfs_mount *mp = XFS_M(sb); 1291 struct xfs_mount *mp = XFS_M(sb);
1277 1292
1278 xfs_quiesce_attr(mp); 1293 xfs_quiesce_attr(mp);
1279 xfs_fs_log_dummy(mp); 1294 return -xfs_fs_log_dummy(mp);
1280} 1295}
1281 1296
1282STATIC int 1297STATIC int
@@ -1348,7 +1363,7 @@ xfs_finish_flags(
1348{ 1363{
1349 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); 1364 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
1350 1365
1351 /* Fail a mount where the logbuf is smaller then the log stripe */ 1366 /* Fail a mount where the logbuf is smaller than the log stripe */
1352 if (xfs_sb_version_haslogv2(&mp->m_sb)) { 1367 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1353 if (mp->m_logbsize <= 0 && 1368 if (mp->m_logbsize <= 0 &&
1354 mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { 1369 mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
@@ -1557,7 +1572,7 @@ static struct super_operations xfs_super_operations = {
1557 .put_super = xfs_fs_put_super, 1572 .put_super = xfs_fs_put_super,
1558 .write_super = xfs_fs_write_super, 1573 .write_super = xfs_fs_write_super,
1559 .sync_fs = xfs_fs_sync_super, 1574 .sync_fs = xfs_fs_sync_super,
1560 .write_super_lockfs = xfs_fs_lockfs, 1575 .freeze_fs = xfs_fs_freeze,
1561 .statfs = xfs_fs_statfs, 1576 .statfs = xfs_fs_statfs,
1562 .remount_fs = xfs_fs_remount, 1577 .remount_fs = xfs_fs_remount,
1563 .show_options = xfs_fs_show_options, 1578 .show_options = xfs_fs_show_options,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 2ed035354c26..a608e72fa405 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -371,7 +371,11 @@ xfs_quiesce_attr(
371 /* flush inodes and push all remaining buffers out to disk */ 371 /* flush inodes and push all remaining buffers out to disk */
372 xfs_quiesce_fs(mp); 372 xfs_quiesce_fs(mp);
373 373
374 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); 374 /*
375 * Just warn here till VFS can correctly support
376 * read-only remount without racing.
377 */
378 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
375 379
376 /* Push the superblock and write an unmount record */ 380 /* Push the superblock and write an unmount record */
377 error = xfs_log_sbcount(mp, 1); 381 error = xfs_log_sbcount(mp, 1);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 591ca6602bfb..6543c0b29753 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -73,6 +73,8 @@ int xfs_dqreq_num;
73int xfs_dqerror_mod = 33; 73int xfs_dqerror_mod = 33;
74#endif 74#endif
75 75
76static struct lock_class_key xfs_dquot_other_class;
77
76/* 78/*
77 * Allocate and initialize a dquot. We don't always allocate fresh memory; 79 * Allocate and initialize a dquot. We don't always allocate fresh memory;
78 * we try to reclaim a free dquot if the number of incore dquots are above 80 * we try to reclaim a free dquot if the number of incore dquots are above
@@ -139,7 +141,15 @@ xfs_qm_dqinit(
139 ASSERT(dqp->q_trace); 141 ASSERT(dqp->q_trace);
140 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT"); 142 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
141#endif 143#endif
142 } 144 }
145
146 /*
147 * In either case we need to make sure group quotas have a different
148 * lock class than user quotas, to make sure lockdep knows we can
149 * hold locks of one of each at the same time.
150 */
151 if (!(type & XFS_DQ_USER))
152 lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
143 153
144 /* 154 /*
145 * log item gets initialized later 155 * log item gets initialized later
@@ -421,7 +431,7 @@ xfs_qm_dqalloc(
421 /* 431 /*
422 * Initialize the bmap freelist prior to calling bmapi code. 432 * Initialize the bmap freelist prior to calling bmapi code.
423 */ 433 */
424 XFS_BMAP_INIT(&flist, &firstblock); 434 xfs_bmap_init(&flist, &firstblock);
425 xfs_ilock(quotip, XFS_ILOCK_EXCL); 435 xfs_ilock(quotip, XFS_ILOCK_EXCL);
426 /* 436 /*
427 * Return if this type of quotas is turned off while we didn't 437 * Return if this type of quotas is turned off while we didn't
@@ -1383,6 +1393,12 @@ xfs_dqunlock_nonotify(
1383 mutex_unlock(&(dqp->q_qlock)); 1393 mutex_unlock(&(dqp->q_qlock));
1384} 1394}
1385 1395
1396/*
1397 * Lock two xfs_dquot structures.
1398 *
1399 * To avoid deadlocks we always lock the quota structure with
1400 * the lower id first.
1401 */
1386void 1402void
1387xfs_dqlock2( 1403xfs_dqlock2(
1388 xfs_dquot_t *d1, 1404 xfs_dquot_t *d1,
@@ -1392,18 +1408,16 @@ xfs_dqlock2(
1392 ASSERT(d1 != d2); 1408 ASSERT(d1 != d2);
1393 if (be32_to_cpu(d1->q_core.d_id) > 1409 if (be32_to_cpu(d1->q_core.d_id) >
1394 be32_to_cpu(d2->q_core.d_id)) { 1410 be32_to_cpu(d2->q_core.d_id)) {
1395 xfs_dqlock(d2); 1411 mutex_lock(&d2->q_qlock);
1396 xfs_dqlock(d1); 1412 mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
1397 } else { 1413 } else {
1398 xfs_dqlock(d1); 1414 mutex_lock(&d1->q_qlock);
1399 xfs_dqlock(d2); 1415 mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
1400 }
1401 } else {
1402 if (d1) {
1403 xfs_dqlock(d1);
1404 } else if (d2) {
1405 xfs_dqlock(d2);
1406 } 1416 }
1417 } else if (d1) {
1418 mutex_lock(&d1->q_qlock);
1419 } else if (d2) {
1420 mutex_lock(&d2->q_qlock);
1407 } 1421 }
1408} 1422}
1409 1423
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 7e455337e2ba..d443e93b4331 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -97,6 +97,16 @@ typedef struct xfs_dquot {
97#define dq_hashlist q_lists.dqm_hashlist 97#define dq_hashlist q_lists.dqm_hashlist
98#define dq_flags q_lists.dqm_flags 98#define dq_flags q_lists.dqm_flags
99 99
100/*
101 * Lock hierarchy for q_qlock:
102 * XFS_QLOCK_NORMAL is the implicit default,
103 * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
104 */
105enum {
106 XFS_QLOCK_NORMAL = 0,
107 XFS_QLOCK_NESTED,
108};
109
100#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) 110#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
101 111
102#ifdef DEBUG 112#ifdef DEBUG
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 6b13960cf318..7a2beb64314f 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1070,6 +1070,13 @@ xfs_qm_sync(
1070 return 0; 1070 return 0;
1071} 1071}
1072 1072
1073/*
1074 * The hash chains and the mplist use the same xfs_dqhash structure as
1075 * their list head, but we can take the mplist qh_lock and one of the
1076 * hash qh_locks at the same time without any problem as they aren't
1077 * related.
1078 */
1079static struct lock_class_key xfs_quota_mplist_class;
1073 1080
1074/* 1081/*
1075 * This initializes all the quota information that's kept in the 1082 * This initializes all the quota information that's kept in the
@@ -1105,6 +1112,8 @@ xfs_qm_init_quotainfo(
1105 } 1112 }
1106 1113
1107 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1114 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
1115 lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class);
1116
1108 qinf->qi_dqreclaims = 0; 1117 qinf->qi_dqreclaims = 0;
1109 1118
1110 /* mutex used to serialize quotaoffs */ 1119 /* mutex used to serialize quotaoffs */
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index a4e293b93efa..642f1db4def4 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,7 +22,6 @@
22 * Access Control Lists 22 * Access Control Lists
23 */ 23 */
24typedef __uint16_t xfs_acl_perm_t; 24typedef __uint16_t xfs_acl_perm_t;
25typedef __int32_t xfs_acl_type_t;
26typedef __int32_t xfs_acl_tag_t; 25typedef __int32_t xfs_acl_tag_t;
27typedef __int32_t xfs_acl_id_t; 26typedef __int32_t xfs_acl_id_t;
28 27
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f2e21817a226..143d63ecb20a 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -231,7 +231,7 @@ typedef struct xfs_perag
231#define XFS_FSB_TO_AGNO(mp,fsbno) \ 231#define XFS_FSB_TO_AGNO(mp,fsbno) \
232 ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) 232 ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
233#define XFS_FSB_TO_AGBNO(mp,fsbno) \ 233#define XFS_FSB_TO_AGBNO(mp,fsbno) \
234 ((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog))) 234 ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
235#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ 235#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
236 ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ 236 ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
237 (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) 237 (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
@@ -244,8 +244,8 @@ typedef struct xfs_perag
244#define XFS_AG_CHECK_DADDR(mp,d,len) \ 244#define XFS_AG_CHECK_DADDR(mp,d,len) \
245 ((len) == 1 ? \ 245 ((len) == 1 ? \
246 ASSERT((d) == XFS_SB_DADDR || \ 246 ASSERT((d) == XFS_SB_DADDR || \
247 XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \ 247 xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
248 ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \ 248 ASSERT(xfs_daddr_to_agno(mp, d) == \
249 XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1))) 249 xfs_daddr_to_agno(mp, (d) + (len) - 1)))
250 250
251#endif /* __XFS_AG_H__ */ 251#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 733cb75a8c5d..c10c3a292d30 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -115,7 +115,7 @@ xfs_allocbt_free_block(
115 xfs_agblock_t bno; 115 xfs_agblock_t bno;
116 int error; 116 int error;
117 117
118 bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp)); 118 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
119 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); 119 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
120 if (error) 120 if (error)
121 return error; 121 return error;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index f7cdc28aff41..5fde1654b430 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -374,7 +374,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
374 * It won't fit in the shortform, transform to a leaf block. 374 * It won't fit in the shortform, transform to a leaf block.
375 * GROT: another possible req'mt for a double-split btree op. 375 * GROT: another possible req'mt for a double-split btree op.
376 */ 376 */
377 XFS_BMAP_INIT(args.flist, args.firstblock); 377 xfs_bmap_init(args.flist, args.firstblock);
378 error = xfs_attr_shortform_to_leaf(&args); 378 error = xfs_attr_shortform_to_leaf(&args);
379 if (!error) { 379 if (!error) {
380 error = xfs_bmap_finish(&args.trans, args.flist, 380 error = xfs_bmap_finish(&args.trans, args.flist,
@@ -956,7 +956,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
956 * Commit that transaction so that the node_addname() call 956 * Commit that transaction so that the node_addname() call
957 * can manage its own transactions. 957 * can manage its own transactions.
958 */ 958 */
959 XFS_BMAP_INIT(args->flist, args->firstblock); 959 xfs_bmap_init(args->flist, args->firstblock);
960 error = xfs_attr_leaf_to_node(args); 960 error = xfs_attr_leaf_to_node(args);
961 if (!error) { 961 if (!error) {
962 error = xfs_bmap_finish(&args->trans, args->flist, 962 error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1057,7 +1057,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1057 * If the result is small enough, shrink it all into the inode. 1057 * If the result is small enough, shrink it all into the inode.
1058 */ 1058 */
1059 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1059 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1060 XFS_BMAP_INIT(args->flist, args->firstblock); 1060 xfs_bmap_init(args->flist, args->firstblock);
1061 error = xfs_attr_leaf_to_shortform(bp, args, forkoff); 1061 error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
1062 /* bp is gone due to xfs_da_shrink_inode */ 1062 /* bp is gone due to xfs_da_shrink_inode */
1063 if (!error) { 1063 if (!error) {
@@ -1135,7 +1135,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1135 * If the result is small enough, shrink it all into the inode. 1135 * If the result is small enough, shrink it all into the inode.
1136 */ 1136 */
1137 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1137 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1138 XFS_BMAP_INIT(args->flist, args->firstblock); 1138 xfs_bmap_init(args->flist, args->firstblock);
1139 error = xfs_attr_leaf_to_shortform(bp, args, forkoff); 1139 error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
1140 /* bp is gone due to xfs_da_shrink_inode */ 1140 /* bp is gone due to xfs_da_shrink_inode */
1141 if (!error) { 1141 if (!error) {
@@ -1290,7 +1290,7 @@ restart:
1290 * have been a b-tree. 1290 * have been a b-tree.
1291 */ 1291 */
1292 xfs_da_state_free(state); 1292 xfs_da_state_free(state);
1293 XFS_BMAP_INIT(args->flist, args->firstblock); 1293 xfs_bmap_init(args->flist, args->firstblock);
1294 error = xfs_attr_leaf_to_node(args); 1294 error = xfs_attr_leaf_to_node(args);
1295 if (!error) { 1295 if (!error) {
1296 error = xfs_bmap_finish(&args->trans, 1296 error = xfs_bmap_finish(&args->trans,
@@ -1331,7 +1331,7 @@ restart:
1331 * in the index/blkno/rmtblkno/rmtblkcnt fields and 1331 * in the index/blkno/rmtblkno/rmtblkcnt fields and
1332 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. 1332 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
1333 */ 1333 */
1334 XFS_BMAP_INIT(args->flist, args->firstblock); 1334 xfs_bmap_init(args->flist, args->firstblock);
1335 error = xfs_da_split(state); 1335 error = xfs_da_split(state);
1336 if (!error) { 1336 if (!error) {
1337 error = xfs_bmap_finish(&args->trans, args->flist, 1337 error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1443,7 +1443,7 @@ restart:
1443 * Check to see if the tree needs to be collapsed. 1443 * Check to see if the tree needs to be collapsed.
1444 */ 1444 */
1445 if (retval && (state->path.active > 1)) { 1445 if (retval && (state->path.active > 1)) {
1446 XFS_BMAP_INIT(args->flist, args->firstblock); 1446 xfs_bmap_init(args->flist, args->firstblock);
1447 error = xfs_da_join(state); 1447 error = xfs_da_join(state);
1448 if (!error) { 1448 if (!error) {
1449 error = xfs_bmap_finish(&args->trans, 1449 error = xfs_bmap_finish(&args->trans,
@@ -1579,7 +1579,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1579 * Check to see if the tree needs to be collapsed. 1579 * Check to see if the tree needs to be collapsed.
1580 */ 1580 */
1581 if (retval && (state->path.active > 1)) { 1581 if (retval && (state->path.active > 1)) {
1582 XFS_BMAP_INIT(args->flist, args->firstblock); 1582 xfs_bmap_init(args->flist, args->firstblock);
1583 error = xfs_da_join(state); 1583 error = xfs_da_join(state);
1584 if (!error) { 1584 if (!error) {
1585 error = xfs_bmap_finish(&args->trans, args->flist, 1585 error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1630,7 +1630,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1630 == XFS_ATTR_LEAF_MAGIC); 1630 == XFS_ATTR_LEAF_MAGIC);
1631 1631
1632 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1632 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1633 XFS_BMAP_INIT(args->flist, args->firstblock); 1633 xfs_bmap_init(args->flist, args->firstblock);
1634 error = xfs_attr_leaf_to_shortform(bp, args, forkoff); 1634 error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
1635 /* bp is gone due to xfs_da_shrink_inode */ 1635 /* bp is gone due to xfs_da_shrink_inode */
1636 if (!error) { 1636 if (!error) {
@@ -2069,7 +2069,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2069 /* 2069 /*
2070 * Allocate a single extent, up to the size of the value. 2070 * Allocate a single extent, up to the size of the value.
2071 */ 2071 */
2072 XFS_BMAP_INIT(args->flist, args->firstblock); 2072 xfs_bmap_init(args->flist, args->firstblock);
2073 nmap = 1; 2073 nmap = 1;
2074 error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, 2074 error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
2075 blkcnt, 2075 blkcnt,
@@ -2123,7 +2123,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2123 /* 2123 /*
2124 * Try to remember where we decided to put the value. 2124 * Try to remember where we decided to put the value.
2125 */ 2125 */
2126 XFS_BMAP_INIT(args->flist, args->firstblock); 2126 xfs_bmap_init(args->flist, args->firstblock);
2127 nmap = 1; 2127 nmap = 1;
2128 error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, 2128 error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
2129 args->rmtblkcnt, 2129 args->rmtblkcnt,
@@ -2188,7 +2188,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2188 /* 2188 /*
2189 * Try to remember where we decided to put the value. 2189 * Try to remember where we decided to put the value.
2190 */ 2190 */
2191 XFS_BMAP_INIT(args->flist, args->firstblock); 2191 xfs_bmap_init(args->flist, args->firstblock);
2192 nmap = 1; 2192 nmap = 1;
2193 error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, 2193 error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
2194 args->rmtblkcnt, 2194 args->rmtblkcnt,
@@ -2229,7 +2229,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2229 blkcnt = args->rmtblkcnt; 2229 blkcnt = args->rmtblkcnt;
2230 done = 0; 2230 done = 0;
2231 while (!done) { 2231 while (!done) {
2232 XFS_BMAP_INIT(args->flist, args->firstblock); 2232 xfs_bmap_init(args->flist, args->firstblock);
2233 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, 2233 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
2234 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 2234 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2235 1, args->firstblock, args->flist, 2235 1, args->firstblock, args->flist,
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 79da6b2ea99e..6c323f8a4cd1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -736,7 +736,7 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
736 continue; /* don't copy partial entries */ 736 continue; /* don't copy partial entries */
737 if (!(entry->flags & XFS_ATTR_LOCAL)) 737 if (!(entry->flags & XFS_ATTR_LOCAL))
738 return(0); 738 return(0);
739 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); 739 name_loc = xfs_attr_leaf_name_local(leaf, i);
740 if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) 740 if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
741 return(0); 741 return(0);
742 if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) 742 if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
@@ -823,7 +823,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
823 if (!entry->nameidx) 823 if (!entry->nameidx)
824 continue; 824 continue;
825 ASSERT(entry->flags & XFS_ATTR_LOCAL); 825 ASSERT(entry->flags & XFS_ATTR_LOCAL);
826 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); 826 name_loc = xfs_attr_leaf_name_local(leaf, i);
827 nargs.name = (char *)name_loc->nameval; 827 nargs.name = (char *)name_loc->nameval;
828 nargs.namelen = name_loc->namelen; 828 nargs.namelen = name_loc->namelen;
829 nargs.value = (char *)&name_loc->nameval[nargs.namelen]; 829 nargs.value = (char *)&name_loc->nameval[nargs.namelen];
@@ -1141,14 +1141,14 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
1141 * as part of this transaction (a split operation for example). 1141 * as part of this transaction (a split operation for example).
1142 */ 1142 */
1143 if (entry->flags & XFS_ATTR_LOCAL) { 1143 if (entry->flags & XFS_ATTR_LOCAL) {
1144 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); 1144 name_loc = xfs_attr_leaf_name_local(leaf, args->index);
1145 name_loc->namelen = args->namelen; 1145 name_loc->namelen = args->namelen;
1146 name_loc->valuelen = cpu_to_be16(args->valuelen); 1146 name_loc->valuelen = cpu_to_be16(args->valuelen);
1147 memcpy((char *)name_loc->nameval, args->name, args->namelen); 1147 memcpy((char *)name_loc->nameval, args->name, args->namelen);
1148 memcpy((char *)&name_loc->nameval[args->namelen], args->value, 1148 memcpy((char *)&name_loc->nameval[args->namelen], args->value,
1149 be16_to_cpu(name_loc->valuelen)); 1149 be16_to_cpu(name_loc->valuelen));
1150 } else { 1150 } else {
1151 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); 1151 name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
1152 name_rmt->namelen = args->namelen; 1152 name_rmt->namelen = args->namelen;
1153 memcpy((char *)name_rmt->name, args->name, args->namelen); 1153 memcpy((char *)name_rmt->name, args->name, args->namelen);
1154 entry->flags |= XFS_ATTR_INCOMPLETE; 1154 entry->flags |= XFS_ATTR_INCOMPLETE;
@@ -1159,7 +1159,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
1159 args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); 1159 args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
1160 } 1160 }
1161 xfs_da_log_buf(args->trans, bp, 1161 xfs_da_log_buf(args->trans, bp,
1162 XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index), 1162 XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
1163 xfs_attr_leaf_entsize(leaf, args->index))); 1163 xfs_attr_leaf_entsize(leaf, args->index)));
1164 1164
1165 /* 1165 /*
@@ -1749,10 +1749,10 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
1749 /* 1749 /*
1750 * Compress the remaining entries and zero out the removed stuff. 1750 * Compress the remaining entries and zero out the removed stuff.
1751 */ 1751 */
1752 memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize); 1752 memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize);
1753 be16_add_cpu(&hdr->usedbytes, -entsize); 1753 be16_add_cpu(&hdr->usedbytes, -entsize);
1754 xfs_da_log_buf(args->trans, bp, 1754 xfs_da_log_buf(args->trans, bp,
1755 XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index), 1755 XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
1756 entsize)); 1756 entsize));
1757 1757
1758 tmp = (be16_to_cpu(hdr->count) - args->index) 1758 tmp = (be16_to_cpu(hdr->count) - args->index)
@@ -1985,7 +1985,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
1985 continue; 1985 continue;
1986 } 1986 }
1987 if (entry->flags & XFS_ATTR_LOCAL) { 1987 if (entry->flags & XFS_ATTR_LOCAL) {
1988 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe); 1988 name_loc = xfs_attr_leaf_name_local(leaf, probe);
1989 if (name_loc->namelen != args->namelen) 1989 if (name_loc->namelen != args->namelen)
1990 continue; 1990 continue;
1991 if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) 1991 if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
@@ -1995,7 +1995,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
1995 args->index = probe; 1995 args->index = probe;
1996 return(XFS_ERROR(EEXIST)); 1996 return(XFS_ERROR(EEXIST));
1997 } else { 1997 } else {
1998 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe); 1998 name_rmt = xfs_attr_leaf_name_remote(leaf, probe);
1999 if (name_rmt->namelen != args->namelen) 1999 if (name_rmt->namelen != args->namelen)
2000 continue; 2000 continue;
2001 if (memcmp(args->name, (char *)name_rmt->name, 2001 if (memcmp(args->name, (char *)name_rmt->name,
@@ -2035,7 +2035,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
2035 2035
2036 entry = &leaf->entries[args->index]; 2036 entry = &leaf->entries[args->index];
2037 if (entry->flags & XFS_ATTR_LOCAL) { 2037 if (entry->flags & XFS_ATTR_LOCAL) {
2038 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); 2038 name_loc = xfs_attr_leaf_name_local(leaf, args->index);
2039 ASSERT(name_loc->namelen == args->namelen); 2039 ASSERT(name_loc->namelen == args->namelen);
2040 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); 2040 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
2041 valuelen = be16_to_cpu(name_loc->valuelen); 2041 valuelen = be16_to_cpu(name_loc->valuelen);
@@ -2050,7 +2050,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
2050 args->valuelen = valuelen; 2050 args->valuelen = valuelen;
2051 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); 2051 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
2052 } else { 2052 } else {
2053 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); 2053 name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
2054 ASSERT(name_rmt->namelen == args->namelen); 2054 ASSERT(name_rmt->namelen == args->namelen);
2055 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); 2055 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
2056 valuelen = be32_to_cpu(name_rmt->valuelen); 2056 valuelen = be32_to_cpu(name_rmt->valuelen);
@@ -2143,7 +2143,7 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
2143 * off for 6.2, should be revisited later. 2143 * off for 6.2, should be revisited later.
2144 */ 2144 */
2145 if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ 2145 if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
2146 memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp); 2146 memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
2147 be16_add_cpu(&hdr_s->usedbytes, -tmp); 2147 be16_add_cpu(&hdr_s->usedbytes, -tmp);
2148 be16_add_cpu(&hdr_s->count, -1); 2148 be16_add_cpu(&hdr_s->count, -1);
2149 entry_d--; /* to compensate for ++ in loop hdr */ 2149 entry_d--; /* to compensate for ++ in loop hdr */
@@ -2160,11 +2160,11 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
2160 entry_d->flags = entry_s->flags; 2160 entry_d->flags = entry_s->flags;
2161 ASSERT(be16_to_cpu(entry_d->nameidx) + tmp 2161 ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
2162 <= XFS_LBSIZE(mp)); 2162 <= XFS_LBSIZE(mp));
2163 memmove(XFS_ATTR_LEAF_NAME(leaf_d, desti), 2163 memmove(xfs_attr_leaf_name(leaf_d, desti),
2164 XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp); 2164 xfs_attr_leaf_name(leaf_s, start_s + i), tmp);
2165 ASSERT(be16_to_cpu(entry_s->nameidx) + tmp 2165 ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
2166 <= XFS_LBSIZE(mp)); 2166 <= XFS_LBSIZE(mp));
2167 memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp); 2167 memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
2168 be16_add_cpu(&hdr_s->usedbytes, -tmp); 2168 be16_add_cpu(&hdr_s->usedbytes, -tmp);
2169 be16_add_cpu(&hdr_d->usedbytes, tmp); 2169 be16_add_cpu(&hdr_d->usedbytes, tmp);
2170 be16_add_cpu(&hdr_s->count, -1); 2170 be16_add_cpu(&hdr_s->count, -1);
@@ -2276,12 +2276,12 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
2276 2276
2277 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); 2277 ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
2278 if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { 2278 if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
2279 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index); 2279 name_loc = xfs_attr_leaf_name_local(leaf, index);
2280 size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen, 2280 size = xfs_attr_leaf_entsize_local(name_loc->namelen,
2281 be16_to_cpu(name_loc->valuelen)); 2281 be16_to_cpu(name_loc->valuelen));
2282 } else { 2282 } else {
2283 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index); 2283 name_rmt = xfs_attr_leaf_name_remote(leaf, index);
2284 size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen); 2284 size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
2285 } 2285 }
2286 return(size); 2286 return(size);
2287} 2287}
@@ -2297,13 +2297,13 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
2297{ 2297{
2298 int size; 2298 int size;
2299 2299
2300 size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(namelen, valuelen); 2300 size = xfs_attr_leaf_entsize_local(namelen, valuelen);
2301 if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) { 2301 if (size < xfs_attr_leaf_entsize_local_max(blocksize)) {
2302 if (local) { 2302 if (local) {
2303 *local = 1; 2303 *local = 1;
2304 } 2304 }
2305 } else { 2305 } else {
2306 size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(namelen); 2306 size = xfs_attr_leaf_entsize_remote(namelen);
2307 if (local) { 2307 if (local) {
2308 *local = 0; 2308 *local = 0;
2309 } 2309 }
@@ -2372,7 +2372,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2372 2372
2373 if (entry->flags & XFS_ATTR_LOCAL) { 2373 if (entry->flags & XFS_ATTR_LOCAL) {
2374 xfs_attr_leaf_name_local_t *name_loc = 2374 xfs_attr_leaf_name_local_t *name_loc =
2375 XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); 2375 xfs_attr_leaf_name_local(leaf, i);
2376 2376
2377 retval = context->put_listent(context, 2377 retval = context->put_listent(context,
2378 entry->flags, 2378 entry->flags,
@@ -2384,7 +2384,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2384 return retval; 2384 return retval;
2385 } else { 2385 } else {
2386 xfs_attr_leaf_name_remote_t *name_rmt = 2386 xfs_attr_leaf_name_remote_t *name_rmt =
2387 XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); 2387 xfs_attr_leaf_name_remote(leaf, i);
2388 2388
2389 int valuelen = be32_to_cpu(name_rmt->valuelen); 2389 int valuelen = be32_to_cpu(name_rmt->valuelen);
2390 2390
@@ -2468,11 +2468,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2468 2468
2469#ifdef DEBUG 2469#ifdef DEBUG
2470 if (entry->flags & XFS_ATTR_LOCAL) { 2470 if (entry->flags & XFS_ATTR_LOCAL) {
2471 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); 2471 name_loc = xfs_attr_leaf_name_local(leaf, args->index);
2472 namelen = name_loc->namelen; 2472 namelen = name_loc->namelen;
2473 name = (char *)name_loc->nameval; 2473 name = (char *)name_loc->nameval;
2474 } else { 2474 } else {
2475 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); 2475 name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
2476 namelen = name_rmt->namelen; 2476 namelen = name_rmt->namelen;
2477 name = (char *)name_rmt->name; 2477 name = (char *)name_rmt->name;
2478 } 2478 }
@@ -2487,7 +2487,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2487 2487
2488 if (args->rmtblkno) { 2488 if (args->rmtblkno) {
2489 ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); 2489 ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
2490 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); 2490 name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
2491 name_rmt->valueblk = cpu_to_be32(args->rmtblkno); 2491 name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
2492 name_rmt->valuelen = cpu_to_be32(args->valuelen); 2492 name_rmt->valuelen = cpu_to_be32(args->valuelen);
2493 xfs_da_log_buf(args->trans, bp, 2493 xfs_da_log_buf(args->trans, bp,
@@ -2534,7 +2534,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
2534 xfs_da_log_buf(args->trans, bp, 2534 xfs_da_log_buf(args->trans, bp,
2535 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); 2535 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
2536 if ((entry->flags & XFS_ATTR_LOCAL) == 0) { 2536 if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
2537 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); 2537 name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
2538 name_rmt->valueblk = 0; 2538 name_rmt->valueblk = 0;
2539 name_rmt->valuelen = 0; 2539 name_rmt->valuelen = 0;
2540 xfs_da_log_buf(args->trans, bp, 2540 xfs_da_log_buf(args->trans, bp,
@@ -2607,20 +2607,20 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2607 2607
2608#ifdef DEBUG 2608#ifdef DEBUG
2609 if (entry1->flags & XFS_ATTR_LOCAL) { 2609 if (entry1->flags & XFS_ATTR_LOCAL) {
2610 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index); 2610 name_loc = xfs_attr_leaf_name_local(leaf1, args->index);
2611 namelen1 = name_loc->namelen; 2611 namelen1 = name_loc->namelen;
2612 name1 = (char *)name_loc->nameval; 2612 name1 = (char *)name_loc->nameval;
2613 } else { 2613 } else {
2614 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index); 2614 name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
2615 namelen1 = name_rmt->namelen; 2615 namelen1 = name_rmt->namelen;
2616 name1 = (char *)name_rmt->name; 2616 name1 = (char *)name_rmt->name;
2617 } 2617 }
2618 if (entry2->flags & XFS_ATTR_LOCAL) { 2618 if (entry2->flags & XFS_ATTR_LOCAL) {
2619 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2); 2619 name_loc = xfs_attr_leaf_name_local(leaf2, args->index2);
2620 namelen2 = name_loc->namelen; 2620 namelen2 = name_loc->namelen;
2621 name2 = (char *)name_loc->nameval; 2621 name2 = (char *)name_loc->nameval;
2622 } else { 2622 } else {
2623 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2); 2623 name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
2624 namelen2 = name_rmt->namelen; 2624 namelen2 = name_rmt->namelen;
2625 name2 = (char *)name_rmt->name; 2625 name2 = (char *)name_rmt->name;
2626 } 2626 }
@@ -2637,7 +2637,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2637 XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); 2637 XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
2638 if (args->rmtblkno) { 2638 if (args->rmtblkno) {
2639 ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); 2639 ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
2640 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index); 2640 name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
2641 name_rmt->valueblk = cpu_to_be32(args->rmtblkno); 2641 name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
2642 name_rmt->valuelen = cpu_to_be32(args->valuelen); 2642 name_rmt->valuelen = cpu_to_be32(args->valuelen);
2643 xfs_da_log_buf(args->trans, bp1, 2643 xfs_da_log_buf(args->trans, bp1,
@@ -2648,7 +2648,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2648 xfs_da_log_buf(args->trans, bp2, 2648 xfs_da_log_buf(args->trans, bp2,
2649 XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); 2649 XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
2650 if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { 2650 if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
2651 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2); 2651 name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
2652 name_rmt->valueblk = 0; 2652 name_rmt->valueblk = 0;
2653 name_rmt->valuelen = 0; 2653 name_rmt->valuelen = 0;
2654 xfs_da_log_buf(args->trans, bp2, 2654 xfs_da_log_buf(args->trans, bp2,
@@ -2855,7 +2855,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
2855 for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { 2855 for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
2856 if (be16_to_cpu(entry->nameidx) && 2856 if (be16_to_cpu(entry->nameidx) &&
2857 ((entry->flags & XFS_ATTR_LOCAL) == 0)) { 2857 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
2858 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); 2858 name_rmt = xfs_attr_leaf_name_remote(leaf, i);
2859 if (name_rmt->valueblk) 2859 if (name_rmt->valueblk)
2860 count++; 2860 count++;
2861 } 2861 }
@@ -2883,7 +2883,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
2883 for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { 2883 for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
2884 if (be16_to_cpu(entry->nameidx) && 2884 if (be16_to_cpu(entry->nameidx) &&
2885 ((entry->flags & XFS_ATTR_LOCAL) == 0)) { 2885 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
2886 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); 2886 name_rmt = xfs_attr_leaf_name_remote(leaf, i);
2887 if (name_rmt->valueblk) { 2887 if (name_rmt->valueblk) {
2888 lp->valueblk = be32_to_cpu(name_rmt->valueblk); 2888 lp->valueblk = be32_to_cpu(name_rmt->valueblk);
2889 lp->valuelen = XFS_B_TO_FSB(dp->i_mount, 2889 lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 83e9af417ca2..9c7d22fdcf4d 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -151,8 +151,6 @@ typedef struct xfs_attr_leafblock {
151/* 151/*
152 * Cast typed pointers for "local" and "remote" name/value structs. 152 * Cast typed pointers for "local" and "remote" name/value structs.
153 */ 153 */
154#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) \
155 xfs_attr_leaf_name_remote(leafp,idx)
156static inline xfs_attr_leaf_name_remote_t * 154static inline xfs_attr_leaf_name_remote_t *
157xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) 155xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
158{ 156{
@@ -160,8 +158,6 @@ xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
160 &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; 158 &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
161} 159}
162 160
163#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) \
164 xfs_attr_leaf_name_local(leafp,idx)
165static inline xfs_attr_leaf_name_local_t * 161static inline xfs_attr_leaf_name_local_t *
166xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) 162xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
167{ 163{
@@ -169,8 +165,6 @@ xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
169 &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; 165 &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
170} 166}
171 167
172#define XFS_ATTR_LEAF_NAME(leafp,idx) \
173 xfs_attr_leaf_name(leafp,idx)
174static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) 168static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
175{ 169{
176 return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; 170 return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
@@ -181,24 +175,18 @@ static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
181 * a "local" name/value structure, a "remote" name/value structure, and 175 * a "local" name/value structure, a "remote" name/value structure, and
182 * a pointer which might be either. 176 * a pointer which might be either.
183 */ 177 */
184#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) \
185 xfs_attr_leaf_entsize_remote(nlen)
186static inline int xfs_attr_leaf_entsize_remote(int nlen) 178static inline int xfs_attr_leaf_entsize_remote(int nlen)
187{ 179{
188 return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ 180 return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
189 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); 181 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
190} 182}
191 183
192#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) \
193 xfs_attr_leaf_entsize_local(nlen,vlen)
194static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) 184static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
195{ 185{
196 return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + 186 return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
197 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); 187 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
198} 188}
199 189
200#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) \
201 xfs_attr_leaf_entsize_local_max(bsize)
202static inline int xfs_attr_leaf_entsize_local_max(int bsize) 190static inline int xfs_attr_leaf_entsize_local_max(int bsize)
203{ 191{
204 return (((bsize) >> 1) + ((bsize) >> 2)); 192 return (((bsize) >> 1) + ((bsize) >> 2));
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index bca7b243c319..f1e3c907044d 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -23,24 +23,16 @@
23 */ 23 */
24 24
25/* 25/*
26 * masks with n high/low bits set, 32-bit values & 64-bit values 26 * masks with n high/low bits set, 64-bit values
27 */ 27 */
28#define XFS_MASK32HI(n) xfs_mask32hi(n)
29static inline __uint32_t xfs_mask32hi(int n)
30{
31 return (__uint32_t)-1 << (32 - (n));
32}
33#define XFS_MASK64HI(n) xfs_mask64hi(n)
34static inline __uint64_t xfs_mask64hi(int n) 28static inline __uint64_t xfs_mask64hi(int n)
35{ 29{
36 return (__uint64_t)-1 << (64 - (n)); 30 return (__uint64_t)-1 << (64 - (n));
37} 31}
38#define XFS_MASK32LO(n) xfs_mask32lo(n)
39static inline __uint32_t xfs_mask32lo(int n) 32static inline __uint32_t xfs_mask32lo(int n)
40{ 33{
41 return ((__uint32_t)1 << (n)) - 1; 34 return ((__uint32_t)1 << (n)) - 1;
42} 35}
43#define XFS_MASK64LO(n) xfs_mask64lo(n)
44static inline __uint64_t xfs_mask64lo(int n) 36static inline __uint64_t xfs_mask64lo(int n)
45{ 37{
46 return ((__uint64_t)1 << (n)) - 1; 38 return ((__uint64_t)1 << (n)) - 1;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 138308e70d14..c852cd65aaea 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -595,9 +595,9 @@ xfs_bmap_add_extent(
595 xfs_iext_insert(ifp, 0, 1, new); 595 xfs_iext_insert(ifp, 0, 1, new);
596 ASSERT(cur == NULL); 596 ASSERT(cur == NULL);
597 ifp->if_lastex = 0; 597 ifp->if_lastex = 0;
598 if (!ISNULLSTARTBLOCK(new->br_startblock)) { 598 if (!isnullstartblock(new->br_startblock)) {
599 XFS_IFORK_NEXT_SET(ip, whichfork, 1); 599 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
600 logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); 600 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
601 } else 601 } else
602 logflags = 0; 602 logflags = 0;
603 /* DELTA: single new extent */ 603 /* DELTA: single new extent */
@@ -613,7 +613,7 @@ xfs_bmap_add_extent(
613 /* 613 /*
614 * Any kind of new delayed allocation goes here. 614 * Any kind of new delayed allocation goes here.
615 */ 615 */
616 else if (ISNULLSTARTBLOCK(new->br_startblock)) { 616 else if (isnullstartblock(new->br_startblock)) {
617 if (cur) 617 if (cur)
618 ASSERT((cur->bc_private.b.flags & 618 ASSERT((cur->bc_private.b.flags &
619 XFS_BTCUR_BPRV_WASDEL) == 0); 619 XFS_BTCUR_BPRV_WASDEL) == 0);
@@ -644,11 +644,11 @@ xfs_bmap_add_extent(
644 * in a delayed or unwritten allocation with a real one, or 644 * in a delayed or unwritten allocation with a real one, or
645 * converting real back to unwritten. 645 * converting real back to unwritten.
646 */ 646 */
647 if (!ISNULLSTARTBLOCK(new->br_startblock) && 647 if (!isnullstartblock(new->br_startblock) &&
648 new->br_startoff + new->br_blockcount > prev.br_startoff) { 648 new->br_startoff + new->br_blockcount > prev.br_startoff) {
649 if (prev.br_state != XFS_EXT_UNWRITTEN && 649 if (prev.br_state != XFS_EXT_UNWRITTEN &&
650 ISNULLSTARTBLOCK(prev.br_startblock)) { 650 isnullstartblock(prev.br_startblock)) {
651 da_old = STARTBLOCKVAL(prev.br_startblock); 651 da_old = startblockval(prev.br_startblock);
652 if (cur) 652 if (cur)
653 ASSERT(cur->bc_private.b.flags & 653 ASSERT(cur->bc_private.b.flags &
654 XFS_BTCUR_BPRV_WASDEL); 654 XFS_BTCUR_BPRV_WASDEL);
@@ -803,7 +803,7 @@ xfs_bmap_add_extent_delay_real(
803 */ 803 */
804 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 804 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
805 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 805 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
806 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); 806 STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
807 } 807 }
808 STATE_SET(LEFT_CONTIG, 808 STATE_SET(LEFT_CONTIG,
809 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && 809 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -820,7 +820,7 @@ xfs_bmap_add_extent_delay_real(
820 idx < 820 idx <
821 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { 821 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
822 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 822 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
823 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); 823 STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
824 } 824 }
825 STATE_SET(RIGHT_CONTIG, 825 STATE_SET(RIGHT_CONTIG,
826 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && 826 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1019,8 +1019,8 @@ xfs_bmap_add_extent_delay_real(
1019 goto done; 1019 goto done;
1020 } 1020 }
1021 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1021 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1022 STARTBLOCKVAL(PREV.br_startblock)); 1022 startblockval(PREV.br_startblock));
1023 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1023 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1024 XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); 1024 XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
1025 *dnew = temp; 1025 *dnew = temp;
1026 /* DELTA: The boundary between two in-core extents moved. */ 1026 /* DELTA: The boundary between two in-core extents moved. */
@@ -1067,10 +1067,10 @@ xfs_bmap_add_extent_delay_real(
1067 goto done; 1067 goto done;
1068 } 1068 }
1069 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1069 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1070 STARTBLOCKVAL(PREV.br_startblock) - 1070 startblockval(PREV.br_startblock) -
1071 (cur ? cur->bc_private.b.allocated : 0)); 1071 (cur ? cur->bc_private.b.allocated : 0));
1072 ep = xfs_iext_get_ext(ifp, idx + 1); 1072 ep = xfs_iext_get_ext(ifp, idx + 1);
1073 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1073 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1074 XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK); 1074 XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK);
1075 *dnew = temp; 1075 *dnew = temp;
1076 /* DELTA: One in-core extent is split in two. */ 1076 /* DELTA: One in-core extent is split in two. */
@@ -1110,8 +1110,8 @@ xfs_bmap_add_extent_delay_real(
1110 goto done; 1110 goto done;
1111 } 1111 }
1112 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1112 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1113 STARTBLOCKVAL(PREV.br_startblock)); 1113 startblockval(PREV.br_startblock));
1114 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1114 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1115 XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); 1115 XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
1116 *dnew = temp; 1116 *dnew = temp;
1117 /* DELTA: The boundary between two in-core extents moved. */ 1117 /* DELTA: The boundary between two in-core extents moved. */
@@ -1157,10 +1157,10 @@ xfs_bmap_add_extent_delay_real(
1157 goto done; 1157 goto done;
1158 } 1158 }
1159 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 1159 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1160 STARTBLOCKVAL(PREV.br_startblock) - 1160 startblockval(PREV.br_startblock) -
1161 (cur ? cur->bc_private.b.allocated : 0)); 1161 (cur ? cur->bc_private.b.allocated : 0));
1162 ep = xfs_iext_get_ext(ifp, idx); 1162 ep = xfs_iext_get_ext(ifp, idx);
1163 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1163 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1164 XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); 1164 XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
1165 *dnew = temp; 1165 *dnew = temp;
1166 /* DELTA: One in-core extent is split in two. */ 1166 /* DELTA: One in-core extent is split in two. */
@@ -1213,7 +1213,7 @@ xfs_bmap_add_extent_delay_real(
1213 } 1213 }
1214 temp = xfs_bmap_worst_indlen(ip, temp); 1214 temp = xfs_bmap_worst_indlen(ip, temp);
1215 temp2 = xfs_bmap_worst_indlen(ip, temp2); 1215 temp2 = xfs_bmap_worst_indlen(ip, temp2);
1216 diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) - 1216 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
1217 (cur ? cur->bc_private.b.allocated : 0)); 1217 (cur ? cur->bc_private.b.allocated : 0));
1218 if (diff > 0 && 1218 if (diff > 0 &&
1219 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { 1219 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) {
@@ -1241,11 +1241,11 @@ xfs_bmap_add_extent_delay_real(
1241 } 1241 }
1242 } 1242 }
1243 ep = xfs_iext_get_ext(ifp, idx); 1243 ep = xfs_iext_get_ext(ifp, idx);
1244 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 1244 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1245 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); 1245 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
1246 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); 1246 XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
1247 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), 1247 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
1248 NULLSTARTBLOCK((int)temp2)); 1248 nullstartblock((int)temp2));
1249 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); 1249 XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
1250 *dnew = temp + temp2; 1250 *dnew = temp + temp2;
1251 /* DELTA: One in-core extent is split in three. */ 1251 /* DELTA: One in-core extent is split in three. */
@@ -1365,7 +1365,7 @@ xfs_bmap_add_extent_unwritten_real(
1365 */ 1365 */
1366 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 1366 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1367 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 1367 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
1368 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); 1368 STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
1369 } 1369 }
1370 STATE_SET(LEFT_CONTIG, 1370 STATE_SET(LEFT_CONTIG,
1371 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && 1371 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -1382,7 +1382,7 @@ xfs_bmap_add_extent_unwritten_real(
1382 idx < 1382 idx <
1383 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { 1383 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
1384 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 1384 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
1385 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); 1385 STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
1386 } 1386 }
1387 STATE_SET(RIGHT_CONTIG, 1387 STATE_SET(RIGHT_CONTIG,
1388 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && 1388 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1889,13 +1889,13 @@ xfs_bmap_add_extent_hole_delay(
1889 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1889 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1890 ep = xfs_iext_get_ext(ifp, idx); 1890 ep = xfs_iext_get_ext(ifp, idx);
1891 state = 0; 1891 state = 0;
1892 ASSERT(ISNULLSTARTBLOCK(new->br_startblock)); 1892 ASSERT(isnullstartblock(new->br_startblock));
1893 /* 1893 /*
1894 * Check and set flags if this segment has a left neighbor 1894 * Check and set flags if this segment has a left neighbor
1895 */ 1895 */
1896 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 1896 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1897 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1897 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
1898 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); 1898 STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
1899 } 1899 }
1900 /* 1900 /*
1901 * Check and set flags if the current (right) segment exists. 1901 * Check and set flags if the current (right) segment exists.
@@ -1905,7 +1905,7 @@ xfs_bmap_add_extent_hole_delay(
1905 idx < 1905 idx <
1906 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { 1906 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
1907 xfs_bmbt_get_all(ep, &right); 1907 xfs_bmbt_get_all(ep, &right);
1908 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); 1908 STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
1909 } 1909 }
1910 /* 1910 /*
1911 * Set contiguity flags on the left and right neighbors. 1911 * Set contiguity flags on the left and right neighbors.
@@ -1938,12 +1938,12 @@ xfs_bmap_add_extent_hole_delay(
1938 XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, 1938 XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
1939 XFS_DATA_FORK); 1939 XFS_DATA_FORK);
1940 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1940 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
1941 oldlen = STARTBLOCKVAL(left.br_startblock) + 1941 oldlen = startblockval(left.br_startblock) +
1942 STARTBLOCKVAL(new->br_startblock) + 1942 startblockval(new->br_startblock) +
1943 STARTBLOCKVAL(right.br_startblock); 1943 startblockval(right.br_startblock);
1944 newlen = xfs_bmap_worst_indlen(ip, temp); 1944 newlen = xfs_bmap_worst_indlen(ip, temp);
1945 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1945 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
1946 NULLSTARTBLOCK((int)newlen)); 1946 nullstartblock((int)newlen));
1947 XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, 1947 XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
1948 XFS_DATA_FORK); 1948 XFS_DATA_FORK);
1949 XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK); 1949 XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK);
@@ -1964,11 +1964,11 @@ xfs_bmap_add_extent_hole_delay(
1964 XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, 1964 XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1,
1965 XFS_DATA_FORK); 1965 XFS_DATA_FORK);
1966 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1966 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
1967 oldlen = STARTBLOCKVAL(left.br_startblock) + 1967 oldlen = startblockval(left.br_startblock) +
1968 STARTBLOCKVAL(new->br_startblock); 1968 startblockval(new->br_startblock);
1969 newlen = xfs_bmap_worst_indlen(ip, temp); 1969 newlen = xfs_bmap_worst_indlen(ip, temp);
1970 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1970 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
1971 NULLSTARTBLOCK((int)newlen)); 1971 nullstartblock((int)newlen));
1972 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, 1972 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1,
1973 XFS_DATA_FORK); 1973 XFS_DATA_FORK);
1974 ip->i_df.if_lastex = idx - 1; 1974 ip->i_df.if_lastex = idx - 1;
@@ -1985,11 +1985,11 @@ xfs_bmap_add_extent_hole_delay(
1985 */ 1985 */
1986 XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK); 1986 XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK);
1987 temp = new->br_blockcount + right.br_blockcount; 1987 temp = new->br_blockcount + right.br_blockcount;
1988 oldlen = STARTBLOCKVAL(new->br_startblock) + 1988 oldlen = startblockval(new->br_startblock) +
1989 STARTBLOCKVAL(right.br_startblock); 1989 startblockval(right.br_startblock);
1990 newlen = xfs_bmap_worst_indlen(ip, temp); 1990 newlen = xfs_bmap_worst_indlen(ip, temp);
1991 xfs_bmbt_set_allf(ep, new->br_startoff, 1991 xfs_bmbt_set_allf(ep, new->br_startoff,
1992 NULLSTARTBLOCK((int)newlen), temp, right.br_state); 1992 nullstartblock((int)newlen), temp, right.br_state);
1993 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK); 1993 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK);
1994 ip->i_df.if_lastex = idx; 1994 ip->i_df.if_lastex = idx;
1995 /* DELTA: One in-core extent grew into a hole. */ 1995 /* DELTA: One in-core extent grew into a hole. */
@@ -2085,7 +2085,7 @@ xfs_bmap_add_extent_hole_real(
2085 */ 2085 */
2086 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { 2086 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
2087 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 2087 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
2088 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); 2088 STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
2089 } 2089 }
2090 /* 2090 /*
2091 * Check and set flags if this segment has a current value. 2091 * Check and set flags if this segment has a current value.
@@ -2095,7 +2095,7 @@ xfs_bmap_add_extent_hole_real(
2095 idx < 2095 idx <
2096 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { 2096 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
2097 xfs_bmbt_get_all(ep, &right); 2097 xfs_bmbt_get_all(ep, &right);
2098 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); 2098 STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
2099 } 2099 }
2100 /* 2100 /*
2101 * We're inserting a real allocation between "left" and "right". 2101 * We're inserting a real allocation between "left" and "right".
@@ -2143,7 +2143,7 @@ xfs_bmap_add_extent_hole_real(
2143 XFS_IFORK_NEXT_SET(ip, whichfork, 2143 XFS_IFORK_NEXT_SET(ip, whichfork,
2144 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 2144 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2145 if (cur == NULL) { 2145 if (cur == NULL) {
2146 rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); 2146 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
2147 } else { 2147 } else {
2148 rval = XFS_ILOG_CORE; 2148 rval = XFS_ILOG_CORE;
2149 if ((error = xfs_bmbt_lookup_eq(cur, 2149 if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2185,7 +2185,7 @@ xfs_bmap_add_extent_hole_real(
2185 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork); 2185 XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork);
2186 ifp->if_lastex = idx - 1; 2186 ifp->if_lastex = idx - 1;
2187 if (cur == NULL) { 2187 if (cur == NULL) {
2188 rval = XFS_ILOG_FEXT(whichfork); 2188 rval = xfs_ilog_fext(whichfork);
2189 } else { 2189 } else {
2190 rval = 0; 2190 rval = 0;
2191 if ((error = xfs_bmbt_lookup_eq(cur, 2191 if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2220,7 +2220,7 @@ xfs_bmap_add_extent_hole_real(
2220 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork); 2220 XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork);
2221 ifp->if_lastex = idx; 2221 ifp->if_lastex = idx;
2222 if (cur == NULL) { 2222 if (cur == NULL) {
2223 rval = XFS_ILOG_FEXT(whichfork); 2223 rval = xfs_ilog_fext(whichfork);
2224 } else { 2224 } else {
2225 rval = 0; 2225 rval = 0;
2226 if ((error = xfs_bmbt_lookup_eq(cur, 2226 if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2254,7 +2254,7 @@ xfs_bmap_add_extent_hole_real(
2254 XFS_IFORK_NEXT_SET(ip, whichfork, 2254 XFS_IFORK_NEXT_SET(ip, whichfork,
2255 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 2255 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
2256 if (cur == NULL) { 2256 if (cur == NULL) {
2257 rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); 2257 rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
2258 } else { 2258 } else {
2259 rval = XFS_ILOG_CORE; 2259 rval = XFS_ILOG_CORE;
2260 if ((error = xfs_bmbt_lookup_eq(cur, 2260 if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2482,7 +2482,7 @@ xfs_bmap_adjacent(
2482 * try to use it's last block as our starting point. 2482 * try to use it's last block as our starting point.
2483 */ 2483 */
2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && 2484 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
2485 !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && 2485 !isnullstartblock(ap->prevp->br_startblock) &&
2486 ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, 2486 ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
2487 ap->prevp->br_startblock)) { 2487 ap->prevp->br_startblock)) {
2488 ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; 2488 ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
@@ -2511,7 +2511,7 @@ xfs_bmap_adjacent(
2511 * start block based on it. 2511 * start block based on it.
2512 */ 2512 */
2513 if (ap->prevp->br_startoff != NULLFILEOFF && 2513 if (ap->prevp->br_startoff != NULLFILEOFF &&
2514 !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && 2514 !isnullstartblock(ap->prevp->br_startblock) &&
2515 (prevbno = ap->prevp->br_startblock + 2515 (prevbno = ap->prevp->br_startblock +
2516 ap->prevp->br_blockcount) && 2516 ap->prevp->br_blockcount) &&
2517 ISVALID(prevbno, ap->prevp->br_startblock)) { 2517 ISVALID(prevbno, ap->prevp->br_startblock)) {
@@ -2552,7 +2552,7 @@ xfs_bmap_adjacent(
2552 * If there's a following (right) block, select a requested 2552 * If there's a following (right) block, select a requested
2553 * start block based on it. 2553 * start block based on it.
2554 */ 2554 */
2555 if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) { 2555 if (!isnullstartblock(ap->gotp->br_startblock)) {
2556 /* 2556 /*
2557 * Calculate gap to start of next block. 2557 * Calculate gap to start of next block.
2558 */ 2558 */
@@ -3082,7 +3082,7 @@ xfs_bmap_btree_to_extents(
3082 ASSERT(ifp->if_broot == NULL); 3082 ASSERT(ifp->if_broot == NULL);
3083 ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); 3083 ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
3084 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); 3084 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
3085 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); 3085 *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
3086 return 0; 3086 return 0;
3087} 3087}
3088 3088
@@ -3136,8 +3136,8 @@ xfs_bmap_del_extent(
3136 del_endoff = del->br_startoff + del->br_blockcount; 3136 del_endoff = del->br_startoff + del->br_blockcount;
3137 got_endoff = got.br_startoff + got.br_blockcount; 3137 got_endoff = got.br_startoff + got.br_blockcount;
3138 ASSERT(got_endoff >= del_endoff); 3138 ASSERT(got_endoff >= del_endoff);
3139 delay = ISNULLSTARTBLOCK(got.br_startblock); 3139 delay = isnullstartblock(got.br_startblock);
3140 ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay); 3140 ASSERT(isnullstartblock(del->br_startblock) == delay);
3141 flags = 0; 3141 flags = 0;
3142 qfield = 0; 3142 qfield = 0;
3143 error = 0; 3143 error = 0;
@@ -3189,7 +3189,7 @@ xfs_bmap_del_extent(
3189 } 3189 }
3190 da_old = da_new = 0; 3190 da_old = da_new = 0;
3191 } else { 3191 } else {
3192 da_old = STARTBLOCKVAL(got.br_startblock); 3192 da_old = startblockval(got.br_startblock);
3193 da_new = 0; 3193 da_new = 0;
3194 nblks = 0; 3194 nblks = 0;
3195 do_fx = 0; 3195 do_fx = 0;
@@ -3213,7 +3213,7 @@ xfs_bmap_del_extent(
3213 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 3213 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
3214 flags |= XFS_ILOG_CORE; 3214 flags |= XFS_ILOG_CORE;
3215 if (!cur) { 3215 if (!cur) {
3216 flags |= XFS_ILOG_FEXT(whichfork); 3216 flags |= xfs_ilog_fext(whichfork);
3217 break; 3217 break;
3218 } 3218 }
3219 if ((error = xfs_btree_delete(cur, &i))) 3219 if ((error = xfs_btree_delete(cur, &i)))
@@ -3233,7 +3233,7 @@ xfs_bmap_del_extent(
3233 if (delay) { 3233 if (delay) {
3234 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3234 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3235 da_old); 3235 da_old);
3236 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 3236 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3237 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, 3237 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx,
3238 whichfork); 3238 whichfork);
3239 da_new = temp; 3239 da_new = temp;
@@ -3242,7 +3242,7 @@ xfs_bmap_del_extent(
3242 xfs_bmbt_set_startblock(ep, del_endblock); 3242 xfs_bmbt_set_startblock(ep, del_endblock);
3243 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); 3243 XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork);
3244 if (!cur) { 3244 if (!cur) {
3245 flags |= XFS_ILOG_FEXT(whichfork); 3245 flags |= xfs_ilog_fext(whichfork);
3246 break; 3246 break;
3247 } 3247 }
3248 if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, 3248 if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
@@ -3262,7 +3262,7 @@ xfs_bmap_del_extent(
3262 if (delay) { 3262 if (delay) {
3263 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 3263 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3264 da_old); 3264 da_old);
3265 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 3265 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3266 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, 3266 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx,
3267 whichfork); 3267 whichfork);
3268 da_new = temp; 3268 da_new = temp;
@@ -3270,7 +3270,7 @@ xfs_bmap_del_extent(
3270 } 3270 }
3271 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); 3271 XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork);
3272 if (!cur) { 3272 if (!cur) {
3273 flags |= XFS_ILOG_FEXT(whichfork); 3273 flags |= xfs_ilog_fext(whichfork);
3274 break; 3274 break;
3275 } 3275 }
3276 if ((error = xfs_bmbt_update(cur, got.br_startoff, 3276 if ((error = xfs_bmbt_update(cur, got.br_startoff,
@@ -3345,22 +3345,22 @@ xfs_bmap_del_extent(
3345 } 3345 }
3346 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3346 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3347 } else 3347 } else
3348 flags |= XFS_ILOG_FEXT(whichfork); 3348 flags |= xfs_ilog_fext(whichfork);
3349 XFS_IFORK_NEXT_SET(ip, whichfork, 3349 XFS_IFORK_NEXT_SET(ip, whichfork,
3350 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 3350 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
3351 } else { 3351 } else {
3352 ASSERT(whichfork == XFS_DATA_FORK); 3352 ASSERT(whichfork == XFS_DATA_FORK);
3353 temp = xfs_bmap_worst_indlen(ip, temp); 3353 temp = xfs_bmap_worst_indlen(ip, temp);
3354 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); 3354 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3355 temp2 = xfs_bmap_worst_indlen(ip, temp2); 3355 temp2 = xfs_bmap_worst_indlen(ip, temp2);
3356 new.br_startblock = NULLSTARTBLOCK((int)temp2); 3356 new.br_startblock = nullstartblock((int)temp2);
3357 da_new = temp + temp2; 3357 da_new = temp + temp2;
3358 while (da_new > da_old) { 3358 while (da_new > da_old) {
3359 if (temp) { 3359 if (temp) {
3360 temp--; 3360 temp--;
3361 da_new--; 3361 da_new--;
3362 xfs_bmbt_set_startblock(ep, 3362 xfs_bmbt_set_startblock(ep,
3363 NULLSTARTBLOCK((int)temp)); 3363 nullstartblock((int)temp));
3364 } 3364 }
3365 if (da_new == da_old) 3365 if (da_new == da_old)
3366 break; 3366 break;
@@ -3368,7 +3368,7 @@ xfs_bmap_del_extent(
3368 temp2--; 3368 temp2--;
3369 da_new--; 3369 da_new--;
3370 new.br_startblock = 3370 new.br_startblock =
3371 NULLSTARTBLOCK((int)temp2); 3371 nullstartblock((int)temp2);
3372 } 3372 }
3373 } 3373 }
3374 } 3374 }
@@ -3545,7 +3545,7 @@ xfs_bmap_extents_to_btree(
3545 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3545 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3546 for (cnt = i = 0; i < nextents; i++) { 3546 for (cnt = i = 0; i < nextents; i++) {
3547 ep = xfs_iext_get_ext(ifp, i); 3547 ep = xfs_iext_get_ext(ifp, i);
3548 if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) { 3548 if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
3549 arp->l0 = cpu_to_be64(ep->l0); 3549 arp->l0 = cpu_to_be64(ep->l0);
3550 arp->l1 = cpu_to_be64(ep->l1); 3550 arp->l1 = cpu_to_be64(ep->l1);
3551 arp++; cnt++; 3551 arp++; cnt++;
@@ -3572,7 +3572,7 @@ xfs_bmap_extents_to_btree(
3572 xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); 3572 xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
3573 ASSERT(*curp == NULL); 3573 ASSERT(*curp == NULL);
3574 *curp = cur; 3574 *curp = cur;
3575 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork); 3575 *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
3576 return 0; 3576 return 0;
3577} 3577}
3578 3578
@@ -3676,7 +3676,7 @@ xfs_bmap_local_to_extents(
3676 ip->i_d.di_nblocks = 1; 3676 ip->i_d.di_nblocks = 1;
3677 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip, 3677 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
3678 XFS_TRANS_DQ_BCOUNT, 1L); 3678 XFS_TRANS_DQ_BCOUNT, 1L);
3679 flags |= XFS_ILOG_FEXT(whichfork); 3679 flags |= xfs_ilog_fext(whichfork);
3680 } else { 3680 } else {
3681 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); 3681 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
3682 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); 3682 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
@@ -4082,7 +4082,7 @@ xfs_bmap_add_attrfork(
4082 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 4082 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
4083 ip->i_afp->if_flags = XFS_IFEXTENTS; 4083 ip->i_afp->if_flags = XFS_IFEXTENTS;
4084 logflags = 0; 4084 logflags = 0;
4085 XFS_BMAP_INIT(&flist, &firstblock); 4085 xfs_bmap_init(&flist, &firstblock);
4086 switch (ip->i_d.di_format) { 4086 switch (ip->i_d.di_format) {
4087 case XFS_DINODE_FMT_LOCAL: 4087 case XFS_DINODE_FMT_LOCAL:
4088 error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, 4088 error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
@@ -4162,7 +4162,7 @@ xfs_bmap_add_free(
4162 ASSERT(bno != NULLFSBLOCK); 4162 ASSERT(bno != NULLFSBLOCK);
4163 ASSERT(len > 0); 4163 ASSERT(len > 0);
4164 ASSERT(len <= MAXEXTLEN); 4164 ASSERT(len <= MAXEXTLEN);
4165 ASSERT(!ISNULLSTARTBLOCK(bno)); 4165 ASSERT(!isnullstartblock(bno));
4166 agno = XFS_FSB_TO_AGNO(mp, bno); 4166 agno = XFS_FSB_TO_AGNO(mp, bno);
4167 agbno = XFS_FSB_TO_AGBNO(mp, bno); 4167 agbno = XFS_FSB_TO_AGBNO(mp, bno);
4168 ASSERT(agno < mp->m_sb.sb_agcount); 4168 ASSERT(agno < mp->m_sb.sb_agcount);
@@ -4909,7 +4909,7 @@ xfs_bmapi(
4909 got.br_startoff = end; 4909 got.br_startoff = end;
4910 inhole = eof || got.br_startoff > bno; 4910 inhole = eof || got.br_startoff > bno;
4911 wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && 4911 wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
4912 ISNULLSTARTBLOCK(got.br_startblock); 4912 isnullstartblock(got.br_startblock);
4913 /* 4913 /*
4914 * First, deal with the hole before the allocated space 4914 * First, deal with the hole before the allocated space
4915 * that we found, if any. 4915 * that we found, if any.
@@ -5028,7 +5028,7 @@ xfs_bmapi(
5028 } 5028 }
5029 5029
5030 ip->i_delayed_blks += alen; 5030 ip->i_delayed_blks += alen;
5031 abno = NULLSTARTBLOCK(indlen); 5031 abno = nullstartblock(indlen);
5032 } else { 5032 } else {
5033 /* 5033 /*
5034 * If first time, allocate and fill in 5034 * If first time, allocate and fill in
@@ -5144,8 +5144,8 @@ xfs_bmapi(
5144 aoff + alen); 5144 aoff + alen);
5145#ifdef DEBUG 5145#ifdef DEBUG
5146 if (flags & XFS_BMAPI_DELAY) { 5146 if (flags & XFS_BMAPI_DELAY) {
5147 ASSERT(ISNULLSTARTBLOCK(got.br_startblock)); 5147 ASSERT(isnullstartblock(got.br_startblock));
5148 ASSERT(STARTBLOCKVAL(got.br_startblock) > 0); 5148 ASSERT(startblockval(got.br_startblock) > 0);
5149 } 5149 }
5150 ASSERT(got.br_state == XFS_EXT_NORM || 5150 ASSERT(got.br_state == XFS_EXT_NORM ||
5151 got.br_state == XFS_EXT_UNWRITTEN); 5151 got.br_state == XFS_EXT_UNWRITTEN);
@@ -5179,7 +5179,7 @@ xfs_bmapi(
5179 ASSERT((bno >= obno) || (n == 0)); 5179 ASSERT((bno >= obno) || (n == 0));
5180 ASSERT(bno < end); 5180 ASSERT(bno < end);
5181 mval->br_startoff = bno; 5181 mval->br_startoff = bno;
5182 if (ISNULLSTARTBLOCK(got.br_startblock)) { 5182 if (isnullstartblock(got.br_startblock)) {
5183 ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); 5183 ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
5184 mval->br_startblock = DELAYSTARTBLOCK; 5184 mval->br_startblock = DELAYSTARTBLOCK;
5185 } else 5185 } else
@@ -5201,7 +5201,7 @@ xfs_bmapi(
5201 ASSERT(mval->br_blockcount <= len); 5201 ASSERT(mval->br_blockcount <= len);
5202 } else { 5202 } else {
5203 *mval = got; 5203 *mval = got;
5204 if (ISNULLSTARTBLOCK(mval->br_startblock)) { 5204 if (isnullstartblock(mval->br_startblock)) {
5205 ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); 5205 ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
5206 mval->br_startblock = DELAYSTARTBLOCK; 5206 mval->br_startblock = DELAYSTARTBLOCK;
5207 } 5207 }
@@ -5329,12 +5329,12 @@ error0:
5329 * Log everything. Do this after conversion, there's no point in 5329 * Log everything. Do this after conversion, there's no point in
5330 * logging the extent records if we've converted to btree format. 5330 * logging the extent records if we've converted to btree format.
5331 */ 5331 */
5332 if ((logflags & XFS_ILOG_FEXT(whichfork)) && 5332 if ((logflags & xfs_ilog_fext(whichfork)) &&
5333 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) 5333 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
5334 logflags &= ~XFS_ILOG_FEXT(whichfork); 5334 logflags &= ~xfs_ilog_fext(whichfork);
5335 else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && 5335 else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
5336 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) 5336 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
5337 logflags &= ~XFS_ILOG_FBROOT(whichfork); 5337 logflags &= ~xfs_ilog_fbroot(whichfork);
5338 /* 5338 /*
5339 * Log whatever the flags say, even if error. Otherwise we might miss 5339 * Log whatever the flags say, even if error. Otherwise we might miss
5340 * detecting a case where the data is changed, there's an error, 5340 * detecting a case where the data is changed, there's an error,
@@ -5411,7 +5411,7 @@ xfs_bmapi_single(
5411 *fsb = NULLFSBLOCK; 5411 *fsb = NULLFSBLOCK;
5412 return 0; 5412 return 0;
5413 } 5413 }
5414 ASSERT(!ISNULLSTARTBLOCK(got.br_startblock)); 5414 ASSERT(!isnullstartblock(got.br_startblock));
5415 ASSERT(bno < got.br_startoff + got.br_blockcount); 5415 ASSERT(bno < got.br_startoff + got.br_blockcount);
5416 *fsb = got.br_startblock + (bno - got.br_startoff); 5416 *fsb = got.br_startblock + (bno - got.br_startoff);
5417 ifp->if_lastex = lastx; 5417 ifp->if_lastex = lastx;
@@ -5543,7 +5543,7 @@ xfs_bunmapi(
5543 */ 5543 */
5544 ASSERT(ep != NULL); 5544 ASSERT(ep != NULL);
5545 del = got; 5545 del = got;
5546 wasdel = ISNULLSTARTBLOCK(del.br_startblock); 5546 wasdel = isnullstartblock(del.br_startblock);
5547 if (got.br_startoff < start) { 5547 if (got.br_startoff < start) {
5548 del.br_startoff = start; 5548 del.br_startoff = start;
5549 del.br_blockcount -= start - got.br_startoff; 5549 del.br_blockcount -= start - got.br_startoff;
@@ -5638,7 +5638,7 @@ xfs_bunmapi(
5638 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, 5638 xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
5639 lastx - 1), &prev); 5639 lastx - 1), &prev);
5640 ASSERT(prev.br_state == XFS_EXT_NORM); 5640 ASSERT(prev.br_state == XFS_EXT_NORM);
5641 ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock)); 5641 ASSERT(!isnullstartblock(prev.br_startblock));
5642 ASSERT(del.br_startblock == 5642 ASSERT(del.br_startblock ==
5643 prev.br_startblock + prev.br_blockcount); 5643 prev.br_startblock + prev.br_blockcount);
5644 if (prev.br_startoff < start) { 5644 if (prev.br_startoff < start) {
@@ -5666,7 +5666,7 @@ xfs_bunmapi(
5666 } 5666 }
5667 } 5667 }
5668 if (wasdel) { 5668 if (wasdel) {
5669 ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); 5669 ASSERT(startblockval(del.br_startblock) > 0);
5670 /* Update realtime/data freespace, unreserve quota */ 5670 /* Update realtime/data freespace, unreserve quota */
5671 if (isrt) { 5671 if (isrt) {
5672 xfs_filblks_t rtexts; 5672 xfs_filblks_t rtexts;
@@ -5782,12 +5782,12 @@ error0:
5782 * Log everything. Do this after conversion, there's no point in 5782 * Log everything. Do this after conversion, there's no point in
5783 * logging the extent records if we've converted to btree format. 5783 * logging the extent records if we've converted to btree format.
5784 */ 5784 */
5785 if ((logflags & XFS_ILOG_FEXT(whichfork)) && 5785 if ((logflags & xfs_ilog_fext(whichfork)) &&
5786 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) 5786 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
5787 logflags &= ~XFS_ILOG_FEXT(whichfork); 5787 logflags &= ~xfs_ilog_fext(whichfork);
5788 else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && 5788 else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
5789 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) 5789 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
5790 logflags &= ~XFS_ILOG_FBROOT(whichfork); 5790 logflags &= ~xfs_ilog_fbroot(whichfork);
5791 /* 5791 /*
5792 * Log inode even in the error case, if the transaction 5792 * Log inode even in the error case, if the transaction
5793 * is dirty we'll need to shut down the filesystem. 5793 * is dirty we'll need to shut down the filesystem.
@@ -5838,7 +5838,7 @@ xfs_getbmapx_fix_eof_hole(
5838 if (startblock == DELAYSTARTBLOCK) 5838 if (startblock == DELAYSTARTBLOCK)
5839 out->bmv_block = -2; 5839 out->bmv_block = -2;
5840 else 5840 else
5841 out->bmv_block = XFS_FSB_TO_DB(ip, startblock); 5841 out->bmv_block = xfs_fsb_to_db(ip, startblock);
5842 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); 5842 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
5843 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 5843 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
5844 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && 5844 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
@@ -5979,7 +5979,7 @@ xfs_getbmap(
5979 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) 5979 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
5980 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; 5980 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5981 5981
5982 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | 5982 bmapi_flags = xfs_bmapi_aflag(whichfork) |
5983 ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE); 5983 ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
5984 5984
5985 /* 5985 /*
@@ -6098,7 +6098,7 @@ xfs_bmap_isaeof(
6098 */ 6098 */
6099 *aeof = (off >= s.br_startoff && 6099 *aeof = (off >= s.br_startoff &&
6100 off < s.br_startoff + s.br_blockcount && 6100 off < s.br_startoff + s.br_blockcount &&
6101 ISNULLSTARTBLOCK(s.br_startblock)) || 6101 isnullstartblock(s.br_startblock)) ||
6102 off >= s.br_startoff + s.br_blockcount; 6102 off >= s.br_startoff + s.br_blockcount;
6103 return 0; 6103 return 0;
6104} 6104}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 284571c05ed0..be2979d88d32 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -95,7 +95,6 @@ typedef struct xfs_bmap_free
95 /* need write cache flushing and no */ 95 /* need write cache flushing and no */
96 /* additional allocation alignments */ 96 /* additional allocation alignments */
97 97
98#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w)
99static inline int xfs_bmapi_aflag(int w) 98static inline int xfs_bmapi_aflag(int w)
100{ 99{
101 return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); 100 return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
@@ -107,7 +106,6 @@ static inline int xfs_bmapi_aflag(int w)
107#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) 106#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
108#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) 107#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
109 108
110#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp)
111static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) 109static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
112{ 110{
113 ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ 111 ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 8f1ec73725d3..0760d352586f 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -110,25 +110,25 @@ __xfs_bmbt_get_all(
110 110
111 ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); 111 ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
112 s->br_startoff = ((xfs_fileoff_t)l0 & 112 s->br_startoff = ((xfs_fileoff_t)l0 &
113 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; 113 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
114#if XFS_BIG_BLKNOS 114#if XFS_BIG_BLKNOS
115 s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) | 115 s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
116 (((xfs_fsblock_t)l1) >> 21); 116 (((xfs_fsblock_t)l1) >> 21);
117#else 117#else
118#ifdef DEBUG 118#ifdef DEBUG
119 { 119 {
120 xfs_dfsbno_t b; 120 xfs_dfsbno_t b;
121 121
122 b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) | 122 b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
123 (((xfs_dfsbno_t)l1) >> 21); 123 (((xfs_dfsbno_t)l1) >> 21);
124 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); 124 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
125 s->br_startblock = (xfs_fsblock_t)b; 125 s->br_startblock = (xfs_fsblock_t)b;
126 } 126 }
127#else /* !DEBUG */ 127#else /* !DEBUG */
128 s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21); 128 s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
129#endif /* DEBUG */ 129#endif /* DEBUG */
130#endif /* XFS_BIG_BLKNOS */ 130#endif /* XFS_BIG_BLKNOS */
131 s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21)); 131 s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
132 /* This is xfs_extent_state() in-line */ 132 /* This is xfs_extent_state() in-line */
133 if (ext_flag) { 133 if (ext_flag) {
134 ASSERT(s->br_blockcount != 0); /* saved for DMIG */ 134 ASSERT(s->br_blockcount != 0); /* saved for DMIG */
@@ -153,7 +153,7 @@ xfs_filblks_t
153xfs_bmbt_get_blockcount( 153xfs_bmbt_get_blockcount(
154 xfs_bmbt_rec_host_t *r) 154 xfs_bmbt_rec_host_t *r)
155{ 155{
156 return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21)); 156 return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
157} 157}
158 158
159/* 159/*
@@ -164,15 +164,15 @@ xfs_bmbt_get_startblock(
164 xfs_bmbt_rec_host_t *r) 164 xfs_bmbt_rec_host_t *r)
165{ 165{
166#if XFS_BIG_BLKNOS 166#if XFS_BIG_BLKNOS
167 return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) | 167 return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
168 (((xfs_fsblock_t)r->l1) >> 21); 168 (((xfs_fsblock_t)r->l1) >> 21);
169#else 169#else
170#ifdef DEBUG 170#ifdef DEBUG
171 xfs_dfsbno_t b; 171 xfs_dfsbno_t b;
172 172
173 b = (((xfs_dfsbno_t)r->l0 & XFS_MASK64LO(9)) << 43) | 173 b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
174 (((xfs_dfsbno_t)r->l1) >> 21); 174 (((xfs_dfsbno_t)r->l1) >> 21);
175 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); 175 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
176 return (xfs_fsblock_t)b; 176 return (xfs_fsblock_t)b;
177#else /* !DEBUG */ 177#else /* !DEBUG */
178 return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21); 178 return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
@@ -188,7 +188,7 @@ xfs_bmbt_get_startoff(
188 xfs_bmbt_rec_host_t *r) 188 xfs_bmbt_rec_host_t *r)
189{ 189{
190 return ((xfs_fileoff_t)r->l0 & 190 return ((xfs_fileoff_t)r->l0 &
191 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; 191 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
192} 192}
193 193
194xfs_exntst_t 194xfs_exntst_t
@@ -219,7 +219,7 @@ xfs_filblks_t
219xfs_bmbt_disk_get_blockcount( 219xfs_bmbt_disk_get_blockcount(
220 xfs_bmbt_rec_t *r) 220 xfs_bmbt_rec_t *r)
221{ 221{
222 return (xfs_filblks_t)(be64_to_cpu(r->l1) & XFS_MASK64LO(21)); 222 return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
223} 223}
224 224
225/* 225/*
@@ -230,7 +230,7 @@ xfs_bmbt_disk_get_startoff(
230 xfs_bmbt_rec_t *r) 230 xfs_bmbt_rec_t *r)
231{ 231{
232 return ((xfs_fileoff_t)be64_to_cpu(r->l0) & 232 return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
233 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; 233 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
234} 234}
235 235
236 236
@@ -248,33 +248,33 @@ xfs_bmbt_set_allf(
248 int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; 248 int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
249 249
250 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); 250 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
251 ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); 251 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
252 ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); 252 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
253 253
254#if XFS_BIG_BLKNOS 254#if XFS_BIG_BLKNOS
255 ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); 255 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
256 256
257 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | 257 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
258 ((xfs_bmbt_rec_base_t)startoff << 9) | 258 ((xfs_bmbt_rec_base_t)startoff << 9) |
259 ((xfs_bmbt_rec_base_t)startblock >> 43); 259 ((xfs_bmbt_rec_base_t)startblock >> 43);
260 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | 260 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
261 ((xfs_bmbt_rec_base_t)blockcount & 261 ((xfs_bmbt_rec_base_t)blockcount &
262 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); 262 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
263#else /* !XFS_BIG_BLKNOS */ 263#else /* !XFS_BIG_BLKNOS */
264 if (ISNULLSTARTBLOCK(startblock)) { 264 if (isnullstartblock(startblock)) {
265 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | 265 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
266 ((xfs_bmbt_rec_base_t)startoff << 9) | 266 ((xfs_bmbt_rec_base_t)startoff << 9) |
267 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); 267 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
268 r->l1 = XFS_MASK64HI(11) | 268 r->l1 = xfs_mask64hi(11) |
269 ((xfs_bmbt_rec_base_t)startblock << 21) | 269 ((xfs_bmbt_rec_base_t)startblock << 21) |
270 ((xfs_bmbt_rec_base_t)blockcount & 270 ((xfs_bmbt_rec_base_t)blockcount &
271 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); 271 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
272 } else { 272 } else {
273 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | 273 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
274 ((xfs_bmbt_rec_base_t)startoff << 9); 274 ((xfs_bmbt_rec_base_t)startoff << 9);
275 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | 275 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
276 ((xfs_bmbt_rec_base_t)blockcount & 276 ((xfs_bmbt_rec_base_t)blockcount &
277 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); 277 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
278 } 278 }
279#endif /* XFS_BIG_BLKNOS */ 279#endif /* XFS_BIG_BLKNOS */
280} 280}
@@ -306,11 +306,11 @@ xfs_bmbt_disk_set_allf(
306 int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; 306 int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
307 307
308 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); 308 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
309 ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); 309 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
310 ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); 310 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
311 311
312#if XFS_BIG_BLKNOS 312#if XFS_BIG_BLKNOS
313 ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); 313 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
314 314
315 r->l0 = cpu_to_be64( 315 r->l0 = cpu_to_be64(
316 ((xfs_bmbt_rec_base_t)extent_flag << 63) | 316 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -319,17 +319,17 @@ xfs_bmbt_disk_set_allf(
319 r->l1 = cpu_to_be64( 319 r->l1 = cpu_to_be64(
320 ((xfs_bmbt_rec_base_t)startblock << 21) | 320 ((xfs_bmbt_rec_base_t)startblock << 21) |
321 ((xfs_bmbt_rec_base_t)blockcount & 321 ((xfs_bmbt_rec_base_t)blockcount &
322 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); 322 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
323#else /* !XFS_BIG_BLKNOS */ 323#else /* !XFS_BIG_BLKNOS */
324 if (ISNULLSTARTBLOCK(startblock)) { 324 if (isnullstartblock(startblock)) {
325 r->l0 = cpu_to_be64( 325 r->l0 = cpu_to_be64(
326 ((xfs_bmbt_rec_base_t)extent_flag << 63) | 326 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
327 ((xfs_bmbt_rec_base_t)startoff << 9) | 327 ((xfs_bmbt_rec_base_t)startoff << 9) |
328 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); 328 (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
329 r->l1 = cpu_to_be64(XFS_MASK64HI(11) | 329 r->l1 = cpu_to_be64(xfs_mask64hi(11) |
330 ((xfs_bmbt_rec_base_t)startblock << 21) | 330 ((xfs_bmbt_rec_base_t)startblock << 21) |
331 ((xfs_bmbt_rec_base_t)blockcount & 331 ((xfs_bmbt_rec_base_t)blockcount &
332 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); 332 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
333 } else { 333 } else {
334 r->l0 = cpu_to_be64( 334 r->l0 = cpu_to_be64(
335 ((xfs_bmbt_rec_base_t)extent_flag << 63) | 335 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -337,7 +337,7 @@ xfs_bmbt_disk_set_allf(
337 r->l1 = cpu_to_be64( 337 r->l1 = cpu_to_be64(
338 ((xfs_bmbt_rec_base_t)startblock << 21) | 338 ((xfs_bmbt_rec_base_t)startblock << 21) |
339 ((xfs_bmbt_rec_base_t)blockcount & 339 ((xfs_bmbt_rec_base_t)blockcount &
340 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); 340 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
341 } 341 }
342#endif /* XFS_BIG_BLKNOS */ 342#endif /* XFS_BIG_BLKNOS */
343} 343}
@@ -362,9 +362,9 @@ xfs_bmbt_set_blockcount(
362 xfs_bmbt_rec_host_t *r, 362 xfs_bmbt_rec_host_t *r,
363 xfs_filblks_t v) 363 xfs_filblks_t v)
364{ 364{
365 ASSERT((v & XFS_MASK64HI(43)) == 0); 365 ASSERT((v & xfs_mask64hi(43)) == 0);
366 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) | 366 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
367 (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21)); 367 (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
368} 368}
369 369
370/* 370/*
@@ -376,21 +376,21 @@ xfs_bmbt_set_startblock(
376 xfs_fsblock_t v) 376 xfs_fsblock_t v)
377{ 377{
378#if XFS_BIG_BLKNOS 378#if XFS_BIG_BLKNOS
379 ASSERT((v & XFS_MASK64HI(12)) == 0); 379 ASSERT((v & xfs_mask64hi(12)) == 0);
380 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) | 380 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
381 (xfs_bmbt_rec_base_t)(v >> 43); 381 (xfs_bmbt_rec_base_t)(v >> 43);
382 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) | 382 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
383 (xfs_bmbt_rec_base_t)(v << 21); 383 (xfs_bmbt_rec_base_t)(v << 21);
384#else /* !XFS_BIG_BLKNOS */ 384#else /* !XFS_BIG_BLKNOS */
385 if (ISNULLSTARTBLOCK(v)) { 385 if (isnullstartblock(v)) {
386 r->l0 |= (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); 386 r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
387 r->l1 = (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) | 387 r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
388 ((xfs_bmbt_rec_base_t)v << 21) | 388 ((xfs_bmbt_rec_base_t)v << 21) |
389 (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); 389 (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
390 } else { 390 } else {
391 r->l0 &= ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9); 391 r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
392 r->l1 = ((xfs_bmbt_rec_base_t)v << 21) | 392 r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
393 (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); 393 (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
394 } 394 }
395#endif /* XFS_BIG_BLKNOS */ 395#endif /* XFS_BIG_BLKNOS */
396} 396}
@@ -403,10 +403,10 @@ xfs_bmbt_set_startoff(
403 xfs_bmbt_rec_host_t *r, 403 xfs_bmbt_rec_host_t *r,
404 xfs_fileoff_t v) 404 xfs_fileoff_t v)
405{ 405{
406 ASSERT((v & XFS_MASK64HI(9)) == 0); 406 ASSERT((v & xfs_mask64hi(9)) == 0);
407 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) | 407 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
408 ((xfs_bmbt_rec_base_t)v << 9) | 408 ((xfs_bmbt_rec_base_t)v << 9) |
409 (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); 409 (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
410} 410}
411 411
412/* 412/*
@@ -419,9 +419,9 @@ xfs_bmbt_set_state(
419{ 419{
420 ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); 420 ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
421 if (v == XFS_EXT_NORM) 421 if (v == XFS_EXT_NORM)
422 r->l0 &= XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN); 422 r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
423 else 423 else
424 r->l0 |= XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN); 424 r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
425} 425}
426 426
427/* 427/*
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index a4555abb6622..0e8df007615e 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -76,26 +76,22 @@ typedef struct xfs_bmbt_rec_host {
76#define DSTARTBLOCKMASK \ 76#define DSTARTBLOCKMASK \
77 (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) 77 (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
78 78
79#define ISNULLSTARTBLOCK(x) isnullstartblock(x)
80static inline int isnullstartblock(xfs_fsblock_t x) 79static inline int isnullstartblock(xfs_fsblock_t x)
81{ 80{
82 return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; 81 return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
83} 82}
84 83
85#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x)
86static inline int isnulldstartblock(xfs_dfsbno_t x) 84static inline int isnulldstartblock(xfs_dfsbno_t x)
87{ 85{
88 return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; 86 return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
89} 87}
90 88
91#define NULLSTARTBLOCK(k) nullstartblock(k)
92static inline xfs_fsblock_t nullstartblock(int k) 89static inline xfs_fsblock_t nullstartblock(int k)
93{ 90{
94 ASSERT(k < (1 << STARTBLOCKVALBITS)); 91 ASSERT(k < (1 << STARTBLOCKVALBITS));
95 return STARTBLOCKMASK | (k); 92 return STARTBLOCKMASK | (k);
96} 93}
97 94
98#define STARTBLOCKVAL(x) startblockval(x)
99static inline xfs_filblks_t startblockval(xfs_fsblock_t x) 95static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
100{ 96{
101 return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); 97 return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 7ed59267420d..e73c332eb23f 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -730,8 +730,8 @@ xfs_btree_readahead_lblock(
730 struct xfs_btree_block *block) 730 struct xfs_btree_block *block)
731{ 731{
732 int rval = 0; 732 int rval = 0;
733 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 733 xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
734 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 734 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
735 735
736 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { 736 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
737 xfs_btree_reada_bufl(cur->bc_mp, left, 1); 737 xfs_btree_reada_bufl(cur->bc_mp, left, 1);
@@ -843,7 +843,7 @@ xfs_btree_ptr_is_null(
843 union xfs_btree_ptr *ptr) 843 union xfs_btree_ptr *ptr)
844{ 844{
845 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 845 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
846 return be64_to_cpu(ptr->l) == NULLFSBLOCK; 846 return be64_to_cpu(ptr->l) == NULLDFSBNO;
847 else 847 else
848 return be32_to_cpu(ptr->s) == NULLAGBLOCK; 848 return be32_to_cpu(ptr->s) == NULLAGBLOCK;
849} 849}
@@ -854,7 +854,7 @@ xfs_btree_set_ptr_null(
854 union xfs_btree_ptr *ptr) 854 union xfs_btree_ptr *ptr)
855{ 855{
856 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 856 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
857 ptr->l = cpu_to_be64(NULLFSBLOCK); 857 ptr->l = cpu_to_be64(NULLDFSBNO);
858 else 858 else
859 ptr->s = cpu_to_be32(NULLAGBLOCK); 859 ptr->s = cpu_to_be32(NULLAGBLOCK);
860} 860}
@@ -918,8 +918,8 @@ xfs_btree_init_block(
918 new->bb_numrecs = cpu_to_be16(numrecs); 918 new->bb_numrecs = cpu_to_be16(numrecs);
919 919
920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
921 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); 921 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
922 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); 922 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
923 } else { 923 } else {
924 new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); 924 new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
925 new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); 925 new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
@@ -960,7 +960,7 @@ xfs_btree_buf_to_ptr(
960 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, 960 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
961 XFS_BUF_ADDR(bp))); 961 XFS_BUF_ADDR(bp)));
962 else { 962 else {
963 ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp, 963 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
964 XFS_BUF_ADDR(bp))); 964 XFS_BUF_ADDR(bp)));
965 } 965 }
966} 966}
@@ -971,7 +971,7 @@ xfs_btree_ptr_to_daddr(
971 union xfs_btree_ptr *ptr) 971 union xfs_btree_ptr *ptr)
972{ 972{
973 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 973 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
974 ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK); 974 ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO);
975 975
976 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); 976 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
977 } else { 977 } else {
@@ -2454,7 +2454,7 @@ xfs_btree_new_iroot(
2454 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 2454 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2455 2455
2456 *logflags |= 2456 *logflags |=
2457 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork); 2457 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2458 *stat = 1; 2458 *stat = 1;
2459 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 2459 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2460 return 0; 2460 return 0;
@@ -3048,7 +3048,7 @@ xfs_btree_kill_iroot(
3048 cur->bc_bufs[level - 1] = NULL; 3048 cur->bc_bufs[level - 1] = NULL;
3049 be16_add_cpu(&block->bb_level, -1); 3049 be16_add_cpu(&block->bb_level, -1);
3050 xfs_trans_log_inode(cur->bc_tp, ip, 3050 xfs_trans_log_inode(cur->bc_tp, ip,
3051 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); 3051 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3052 cur->bc_nlevels--; 3052 cur->bc_nlevels--;
3053out0: 3053out0:
3054 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 3054 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index a11a8390bf6c..c45f74ff1a5b 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1597,7 +1597,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1597 nmap = 1; 1597 nmap = 1;
1598 ASSERT(args->firstblock != NULL); 1598 ASSERT(args->firstblock != NULL);
1599 if ((error = xfs_bmapi(tp, dp, bno, count, 1599 if ((error = xfs_bmapi(tp, dp, bno, count,
1600 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| 1600 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
1601 XFS_BMAPI_CONTIG, 1601 XFS_BMAPI_CONTIG,
1602 args->firstblock, args->total, &map, &nmap, 1602 args->firstblock, args->total, &map, &nmap,
1603 args->flist, NULL))) { 1603 args->flist, NULL))) {
@@ -1618,7 +1618,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1618 nmap = MIN(XFS_BMAP_MAX_NMAP, count); 1618 nmap = MIN(XFS_BMAP_MAX_NMAP, count);
1619 c = (int)(bno + count - b); 1619 c = (int)(bno + count - b);
1620 if ((error = xfs_bmapi(tp, dp, b, c, 1620 if ((error = xfs_bmapi(tp, dp, b, c,
1621 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE| 1621 xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
1622 XFS_BMAPI_METADATA, 1622 XFS_BMAPI_METADATA,
1623 args->firstblock, args->total, 1623 args->firstblock, args->total,
1624 &mapp[mapi], &nmap, args->flist, 1624 &mapp[mapi], &nmap, args->flist,
@@ -1882,7 +1882,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
1882 * the last block to the place we want to kill. 1882 * the last block to the place we want to kill.
1883 */ 1883 */
1884 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, 1884 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
1885 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA, 1885 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
1886 0, args->firstblock, args->flist, NULL, 1886 0, args->firstblock, args->flist, NULL,
1887 &done)) == ENOSPC) { 1887 &done)) == ENOSPC) {
1888 if (w != XFS_DATA_FORK) 1888 if (w != XFS_DATA_FORK)
@@ -1987,7 +1987,7 @@ xfs_da_do_buf(
1987 if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, 1987 if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
1988 nfsb, 1988 nfsb,
1989 XFS_BMAPI_METADATA | 1989 XFS_BMAPI_METADATA |
1990 XFS_BMAPI_AFLAG(whichfork), 1990 xfs_bmapi_aflag(whichfork),
1991 NULL, 0, mapp, &nmap, NULL, NULL))) 1991 NULL, 0, mapp, &nmap, NULL, NULL)))
1992 goto exit0; 1992 goto exit0;
1993 } 1993 }
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b4c1ee713492..f8278cfcc1d3 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -55,17 +55,11 @@ xfs_swapext(
55 struct file *file, *target_file; 55 struct file *file, *target_file;
56 int error = 0; 56 int error = 0;
57 57
58 sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
59 if (!sxp) {
60 error = XFS_ERROR(ENOMEM);
61 goto out;
62 }
63
64 /* Pull information for the target fd */ 58 /* Pull information for the target fd */
65 file = fget((int)sxp->sx_fdtarget); 59 file = fget((int)sxp->sx_fdtarget);
66 if (!file) { 60 if (!file) {
67 error = XFS_ERROR(EINVAL); 61 error = XFS_ERROR(EINVAL);
68 goto out_free_sxp; 62 goto out;
69 } 63 }
70 64
71 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { 65 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
@@ -109,8 +103,6 @@ xfs_swapext(
109 fput(target_file); 103 fput(target_file);
110 out_put_file: 104 out_put_file:
111 fput(file); 105 fput(file);
112 out_free_sxp:
113 kmem_free(sxp);
114 out: 106 out:
115 return error; 107 return error;
116} 108}
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e2fa0a1d8e96..e1f0a06aaf04 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -517,9 +517,9 @@ xfs_dir2_block_getdents(
517 /* 517 /*
518 * If it didn't fit, set the final offset to here & return. 518 * If it didn't fit, set the final offset to here & return.
519 */ 519 */
520 if (filldir(dirent, dep->name, dep->namelen, cook, 520 if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
521 ino, DT_UNKNOWN)) { 521 ino, DT_UNKNOWN)) {
522 *offset = cook; 522 *offset = cook & 0x7fffffff;
523 xfs_da_brelse(NULL, bp); 523 xfs_da_brelse(NULL, bp);
524 return 0; 524 return 0;
525 } 525 }
@@ -529,7 +529,8 @@ xfs_dir2_block_getdents(
529 * Reached the end of the block. 529 * Reached the end of the block.
530 * Set the offset to a non-existent block 1 and return. 530 * Set the offset to a non-existent block 1 and return.
531 */ 531 */
532 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0); 532 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
533 0x7fffffff;
533 xfs_da_brelse(NULL, bp); 534 xfs_da_brelse(NULL, bp);
534 return 0; 535 return 0;
535} 536}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 93535992cb60..ef805a374eec 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1092,7 +1092,7 @@ xfs_dir2_leaf_getdents(
1092 * Won't fit. Return to caller. 1092 * Won't fit. Return to caller.
1093 */ 1093 */
1094 if (filldir(dirent, dep->name, dep->namelen, 1094 if (filldir(dirent, dep->name, dep->namelen,
1095 xfs_dir2_byte_to_dataptr(mp, curoff), 1095 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
1096 ino, DT_UNKNOWN)) 1096 ino, DT_UNKNOWN))
1097 break; 1097 break;
1098 1098
@@ -1108,9 +1108,9 @@ xfs_dir2_leaf_getdents(
1108 * All done. Set output offset value to current offset. 1108 * All done. Set output offset value to current offset.
1109 */ 1109 */
1110 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) 1110 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
1111 *offset = XFS_DIR2_MAX_DATAPTR; 1111 *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
1112 else 1112 else
1113 *offset = xfs_dir2_byte_to_dataptr(mp, curoff); 1113 *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
1114 kmem_free(map); 1114 kmem_free(map);
1115 if (bp) 1115 if (bp)
1116 xfs_da_brelse(NULL, bp); 1116 xfs_da_brelse(NULL, bp);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index b46af0013ec9..a8a8a6efad5b 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -752,8 +752,8 @@ xfs_dir2_sf_getdents(
752#if XFS_BIG_INUMS 752#if XFS_BIG_INUMS
753 ino += mp->m_inoadd; 753 ino += mp->m_inoadd;
754#endif 754#endif
755 if (filldir(dirent, ".", 1, dot_offset, ino, DT_DIR)) { 755 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
756 *offset = dot_offset; 756 *offset = dot_offset & 0x7fffffff;
757 return 0; 757 return 0;
758 } 758 }
759 } 759 }
@@ -766,8 +766,8 @@ xfs_dir2_sf_getdents(
766#if XFS_BIG_INUMS 766#if XFS_BIG_INUMS
767 ino += mp->m_inoadd; 767 ino += mp->m_inoadd;
768#endif 768#endif
769 if (filldir(dirent, "..", 2, dotdot_offset, ino, DT_DIR)) { 769 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
770 *offset = dotdot_offset; 770 *offset = dotdot_offset & 0x7fffffff;
771 return 0; 771 return 0;
772 } 772 }
773 } 773 }
@@ -791,14 +791,15 @@ xfs_dir2_sf_getdents(
791#endif 791#endif
792 792
793 if (filldir(dirent, sfep->name, sfep->namelen, 793 if (filldir(dirent, sfep->name, sfep->namelen,
794 off, ino, DT_UNKNOWN)) { 794 off & 0x7fffffff, ino, DT_UNKNOWN)) {
795 *offset = off; 795 *offset = off & 0x7fffffff;
796 return 0; 796 return 0;
797 } 797 }
798 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 798 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
799 } 799 }
800 800
801 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0); 801 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
802 0x7fffffff;
802 return 0; 803 return 0;
803} 804}
804 805
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 589c41c38446..f7c06fac8229 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -465,8 +465,8 @@ typedef struct xfs_handle {
465#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) 465#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
466#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) 466#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
467/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ 467/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
468#define XFS_IOC_FREEZE _IOWR('X', 119, int) 468/* XFS_IOC_FREEZE -- FIFREEZE 119 */
469#define XFS_IOC_THAW _IOWR('X', 120, int) 469/* XFS_IOC_THAW -- FITHAW 120 */
470#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) 470#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
471#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) 471#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
472#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) 472#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 852b6d32e8d0..680d0e0ec932 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -595,17 +595,19 @@ out:
595 return 0; 595 return 0;
596} 596}
597 597
598void 598int
599xfs_fs_log_dummy( 599xfs_fs_log_dummy(
600 xfs_mount_t *mp) 600 xfs_mount_t *mp)
601{ 601{
602 xfs_trans_t *tp; 602 xfs_trans_t *tp;
603 xfs_inode_t *ip; 603 xfs_inode_t *ip;
604 int error;
604 605
605 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); 606 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
606 if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { 607 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
608 if (error) {
607 xfs_trans_cancel(tp, 0); 609 xfs_trans_cancel(tp, 0);
608 return; 610 return error;
609 } 611 }
610 612
611 ip = mp->m_rootip; 613 ip = mp->m_rootip;
@@ -615,9 +617,10 @@ xfs_fs_log_dummy(
615 xfs_trans_ihold(tp, ip); 617 xfs_trans_ihold(tp, ip);
616 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 618 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
617 xfs_trans_set_sync(tp); 619 xfs_trans_set_sync(tp);
618 xfs_trans_commit(tp, 0); 620 error = xfs_trans_commit(tp, 0);
619 621
620 xfs_iunlock(ip, XFS_ILOCK_EXCL); 622 xfs_iunlock(ip, XFS_ILOCK_EXCL);
623 return error;
621} 624}
622 625
623int 626int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 300d0c9d61ad..88435e0a77c9 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern void xfs_fs_log_dummy(xfs_mount_t *mp); 28extern int xfs_fs_log_dummy(xfs_mount_t *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index e6ebbaeb4dc6..ab016e5ae7be 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -357,7 +357,7 @@ xfs_ialloc_ag_alloc(
357 int ioffset = i << args.mp->m_sb.sb_inodelog; 357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode); 358 uint isize = sizeof(struct xfs_dinode);
359 359
360 free = XFS_MAKE_IPTR(args.mp, fbuf, i); 360 free = xfs_make_iptr(args.mp, fbuf, i);
361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
362 free->di_version = version; 362 free->di_version = version;
363 free->di_gen = cpu_to_be32(gen); 363 free->di_gen = cpu_to_be32(gen);
@@ -937,7 +937,7 @@ nextag:
937 } 937 }
938 } 938 }
939 } 939 }
940 offset = XFS_IALLOC_FIND_FREE(&rec.ir_free); 940 offset = xfs_ialloc_find_free(&rec.ir_free);
941 ASSERT(offset >= 0); 941 ASSERT(offset >= 0);
942 ASSERT(offset < XFS_INODES_PER_CHUNK); 942 ASSERT(offset < XFS_INODES_PER_CHUNK);
943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % 943 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1279,7 +1279,7 @@ xfs_imap(
1279 offset = XFS_INO_TO_OFFSET(mp, ino); 1279 offset = XFS_INO_TO_OFFSET(mp, ino);
1280 ASSERT(offset < mp->m_sb.sb_inopblock); 1280 ASSERT(offset < mp->m_sb.sb_inopblock);
1281 1281
1282 cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno); 1282 cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno);
1283 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock; 1283 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
1284 1284
1285 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); 1285 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 50f558a4e0a8..aeee8278f92c 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -39,7 +39,6 @@ struct xfs_trans;
39/* 39/*
40 * Make an inode pointer out of the buffer/offset. 40 * Make an inode pointer out of the buffer/offset.
41 */ 41 */
42#define XFS_MAKE_IPTR(mp,b,o) xfs_make_iptr(mp,b,o)
43static inline struct xfs_dinode * 42static inline struct xfs_dinode *
44xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) 43xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
45{ 44{
@@ -50,7 +49,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
50/* 49/*
51 * Find a free (set) bit in the inode bitmask. 50 * Find a free (set) bit in the inode bitmask.
52 */ 51 */
53#define XFS_IALLOC_FIND_FREE(fp) xfs_ialloc_find_free(fp)
54static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) 52static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
55{ 53{
56 return xfs_lowbit64(*fp); 54 return xfs_lowbit64(*fp);
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 37e5dd01a577..5580e255ff06 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -36,7 +36,6 @@ typedef __uint64_t xfs_inofree_t;
36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) 36#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) 37#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
38 38
39#define XFS_INOBT_MASKN(i,n) xfs_inobt_maskn(i,n)
40static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) 39static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
41{ 40{
42 return (((n) >= XFS_INODES_PER_CHUNK ? \ 41 return (((n) >= XFS_INODES_PER_CHUNK ? \
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5a5e035e5d38..e7ae08d1df48 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -424,6 +424,19 @@ xfs_iformat(
424 case XFS_DINODE_FMT_LOCAL: 424 case XFS_DINODE_FMT_LOCAL:
425 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 425 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
426 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
427
428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
429 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
430 "corrupt inode %Lu "
431 "(bad attr fork size %Ld).",
432 (unsigned long long) ip->i_ino,
433 (long long) size);
434 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
435 XFS_ERRLEVEL_LOW,
436 ip->i_mount, dip);
437 return XFS_ERROR(EFSCORRUPTED);
438 }
439
427 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); 440 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
428 break; 441 break;
429 case XFS_DINODE_FMT_EXTENTS: 442 case XFS_DINODE_FMT_EXTENTS:
@@ -1601,10 +1614,10 @@ xfs_itruncate_finish(
1601 * in this file with garbage in them once recovery 1614 * in this file with garbage in them once recovery
1602 * runs. 1615 * runs.
1603 */ 1616 */
1604 XFS_BMAP_INIT(&free_list, &first_block); 1617 xfs_bmap_init(&free_list, &first_block);
1605 error = xfs_bunmapi(ntp, ip, 1618 error = xfs_bunmapi(ntp, ip,
1606 first_unmap_block, unmap_len, 1619 first_unmap_block, unmap_len,
1607 XFS_BMAPI_AFLAG(fork) | 1620 xfs_bmapi_aflag(fork) |
1608 (sync ? 0 : XFS_BMAPI_ASYNC), 1621 (sync ? 0 : XFS_BMAPI_ASYNC),
1609 XFS_ITRUNC_MAX_EXTENTS, 1622 XFS_ITRUNC_MAX_EXTENTS,
1610 &first_block, &free_list, 1623 &first_block, &free_list,
@@ -2557,7 +2570,7 @@ xfs_iextents_copy(
2557 for (i = 0; i < nrecs; i++) { 2570 for (i = 0; i < nrecs; i++) {
2558 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 2571 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2559 start_block = xfs_bmbt_get_startblock(ep); 2572 start_block = xfs_bmbt_get_startblock(ep);
2560 if (ISNULLSTARTBLOCK(start_block)) { 2573 if (isnullstartblock(start_block)) {
2561 /* 2574 /*
2562 * It's a delayed allocation extent, so skip it. 2575 * It's a delayed allocation extent, so skip it.
2563 */ 2576 */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 1ff04cc323ad..9957d0602d54 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -111,20 +111,16 @@ typedef struct xfs_inode_log_format_64 {
111 111
112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) 112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
113 113
114
115#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
116static inline int xfs_ilog_fbroot(int w) 114static inline int xfs_ilog_fbroot(int w)
117{ 115{
118 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); 116 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
119} 117}
120 118
121#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
122static inline int xfs_ilog_fext(int w) 119static inline int xfs_ilog_fext(int w)
123{ 120{
124 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); 121 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
125} 122}
126 123
127#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
128static inline int xfs_ilog_fdata(int w) 124static inline int xfs_ilog_fdata(int w)
129{ 125{
130 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); 126 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 911062cf73a6..08ce72316bfe 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -155,7 +155,7 @@ xfs_imap_to_bmap(
155 iomapp->iomap_bn = IOMAP_DADDR_NULL; 155 iomapp->iomap_bn = IOMAP_DADDR_NULL;
156 iomapp->iomap_flags |= IOMAP_DELAY; 156 iomapp->iomap_flags |= IOMAP_DELAY;
157 } else { 157 } else {
158 iomapp->iomap_bn = XFS_FSB_TO_DB(ip, start_block); 158 iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
159 if (ISUNWRITTEN(imap)) 159 if (ISUNWRITTEN(imap))
160 iomapp->iomap_flags |= IOMAP_UNWRITTEN; 160 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
161 } 161 }
@@ -261,7 +261,7 @@ xfs_iomap(
261 xfs_iunlock(ip, lockmode); 261 xfs_iunlock(ip, lockmode);
262 lockmode = 0; 262 lockmode = 0;
263 263
264 if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) { 264 if (nimaps && !isnullstartblock(imap.br_startblock)) {
265 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, 265 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip,
266 offset, count, iomapp, &imap, flags); 266 offset, count, iomapp, &imap, flags);
267 break; 267 break;
@@ -491,7 +491,7 @@ xfs_iomap_write_direct(
491 /* 491 /*
492 * Issue the xfs_bmapi() call to allocate the blocks 492 * Issue the xfs_bmapi() call to allocate the blocks
493 */ 493 */
494 XFS_BMAP_INIT(&free_list, &firstfsb); 494 xfs_bmap_init(&free_list, &firstfsb);
495 nimaps = 1; 495 nimaps = 1;
496 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, 496 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
497 &firstfsb, 0, &imap, &nimaps, &free_list, NULL); 497 &firstfsb, 0, &imap, &nimaps, &free_list, NULL);
@@ -751,7 +751,7 @@ xfs_iomap_write_allocate(
751 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 751 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
752 xfs_trans_ihold(tp, ip); 752 xfs_trans_ihold(tp, ip);
753 753
754 XFS_BMAP_INIT(&free_list, &first_block); 754 xfs_bmap_init(&free_list, &first_block);
755 755
756 /* 756 /*
757 * it is possible that the extents have changed since 757 * it is possible that the extents have changed since
@@ -911,7 +911,7 @@ xfs_iomap_write_unwritten(
911 /* 911 /*
912 * Modify the unwritten extent state of the buffer. 912 * Modify the unwritten extent state of the buffer.
913 */ 913 */
914 XFS_BMAP_INIT(&free_list, &firstfsb); 914 xfs_bmap_init(&free_list, &firstfsb);
915 nimaps = 1; 915 nimaps = 1;
916 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, 916 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
917 XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, 917 XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index e19d0a8d5618..cf98a805ec90 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -453,7 +453,7 @@ xfs_bulkstat(
453 (chunkidx = agino - gino + 1) < 453 (chunkidx = agino - gino + 1) <
454 XFS_INODES_PER_CHUNK && 454 XFS_INODES_PER_CHUNK &&
455 /* there are some left allocated */ 455 /* there are some left allocated */
456 XFS_INOBT_MASKN(chunkidx, 456 xfs_inobt_maskn(chunkidx,
457 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { 457 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
458 /* 458 /*
459 * Grab the chunk record. Mark all the 459 * Grab the chunk record. Mark all the
@@ -464,7 +464,7 @@ xfs_bulkstat(
464 if (XFS_INOBT_MASK(i) & ~gfree) 464 if (XFS_INOBT_MASK(i) & ~gfree)
465 gcnt++; 465 gcnt++;
466 } 466 }
467 gfree |= XFS_INOBT_MASKN(0, chunkidx); 467 gfree |= xfs_inobt_maskn(0, chunkidx);
468 irbp->ir_startino = gino; 468 irbp->ir_startino = gino;
469 irbp->ir_freecount = gcnt; 469 irbp->ir_freecount = gcnt;
470 irbp->ir_free = gfree; 470 irbp->ir_free = gfree;
@@ -535,7 +535,7 @@ xfs_bulkstat(
535 chunkidx < XFS_INODES_PER_CHUNK; 535 chunkidx < XFS_INODES_PER_CHUNK;
536 chunkidx += nicluster, 536 chunkidx += nicluster,
537 agbno += nbcluster) { 537 agbno += nbcluster) {
538 if (XFS_INOBT_MASKN(chunkidx, 538 if (xfs_inobt_maskn(chunkidx,
539 nicluster) & ~gfree) 539 nicluster) & ~gfree)
540 xfs_btree_reada_bufs(mp, agno, 540 xfs_btree_reada_bufs(mp, agno,
541 agbno, nbcluster); 541 agbno, nbcluster);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 35cca98bd94c..b1047de2fffd 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -70,16 +70,21 @@ STATIC void xlog_recover_check_summary(xlog_t *);
70xfs_buf_t * 70xfs_buf_t *
71xlog_get_bp( 71xlog_get_bp(
72 xlog_t *log, 72 xlog_t *log,
73 int num_bblks) 73 int nbblks)
74{ 74{
75 ASSERT(num_bblks > 0); 75 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
76 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
77 XFS_ERROR_REPORT("xlog_get_bp(1)",
78 XFS_ERRLEVEL_HIGH, log->l_mp);
79 return NULL;
80 }
76 81
77 if (log->l_sectbb_log) { 82 if (log->l_sectbb_log) {
78 if (num_bblks > 1) 83 if (nbblks > 1)
79 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 84 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
80 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); 85 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
81 } 86 }
82 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); 87 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
83} 88}
84 89
85void 90void
@@ -102,6 +107,13 @@ xlog_bread(
102{ 107{
103 int error; 108 int error;
104 109
110 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
111 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
112 XFS_ERROR_REPORT("xlog_bread(1)",
113 XFS_ERRLEVEL_HIGH, log->l_mp);
114 return EFSCORRUPTED;
115 }
116
105 if (log->l_sectbb_log) { 117 if (log->l_sectbb_log) {
106 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 118 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
107 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 119 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -139,6 +151,13 @@ xlog_bwrite(
139{ 151{
140 int error; 152 int error;
141 153
154 if (nbblks <= 0 || nbblks > log->l_logBBsize) {
155 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
156 XFS_ERROR_REPORT("xlog_bwrite(1)",
157 XFS_ERRLEVEL_HIGH, log->l_mp);
158 return EFSCORRUPTED;
159 }
160
142 if (log->l_sectbb_log) { 161 if (log->l_sectbb_log) {
143 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 162 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
144 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 163 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3c97c6463a4e..35300250e86d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
45#include "xfs_fsops.h" 45#include "xfs_fsops.h"
46#include "xfs_utils.h" 46#include "xfs_utils.h"
47 47
48STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
49STATIC int xfs_uuid_mount(xfs_mount_t *); 48STATIC int xfs_uuid_mount(xfs_mount_t *);
50STATIC void xfs_unmountfs_wait(xfs_mount_t *); 49STATIC void xfs_unmountfs_wait(xfs_mount_t *);
51 50
@@ -682,7 +681,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
682 * Update alignment values based on mount options and sb values 681 * Update alignment values based on mount options and sb values
683 */ 682 */
684STATIC int 683STATIC int
685xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags) 684xfs_update_alignment(xfs_mount_t *mp)
686{ 685{
687 xfs_sb_t *sbp = &(mp->m_sb); 686 xfs_sb_t *sbp = &(mp->m_sb);
688 687
@@ -736,11 +735,11 @@ xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
736 if (xfs_sb_version_hasdalign(sbp)) { 735 if (xfs_sb_version_hasdalign(sbp)) {
737 if (sbp->sb_unit != mp->m_dalign) { 736 if (sbp->sb_unit != mp->m_dalign) {
738 sbp->sb_unit = mp->m_dalign; 737 sbp->sb_unit = mp->m_dalign;
739 *update_flags |= XFS_SB_UNIT; 738 mp->m_update_flags |= XFS_SB_UNIT;
740 } 739 }
741 if (sbp->sb_width != mp->m_swidth) { 740 if (sbp->sb_width != mp->m_swidth) {
742 sbp->sb_width = mp->m_swidth; 741 sbp->sb_width = mp->m_swidth;
743 *update_flags |= XFS_SB_WIDTH; 742 mp->m_update_flags |= XFS_SB_WIDTH;
744 } 743 }
745 } 744 }
746 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && 745 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
@@ -905,7 +904,6 @@ xfs_mountfs(
905 xfs_sb_t *sbp = &(mp->m_sb); 904 xfs_sb_t *sbp = &(mp->m_sb);
906 xfs_inode_t *rip; 905 xfs_inode_t *rip;
907 __uint64_t resblks; 906 __uint64_t resblks;
908 __int64_t update_flags = 0LL;
909 uint quotamount, quotaflags; 907 uint quotamount, quotaflags;
910 int uuid_mounted = 0; 908 int uuid_mounted = 0;
911 int error = 0; 909 int error = 0;
@@ -933,7 +931,7 @@ xfs_mountfs(
933 "XFS: correcting sb_features alignment problem"); 931 "XFS: correcting sb_features alignment problem");
934 sbp->sb_features2 |= sbp->sb_bad_features2; 932 sbp->sb_features2 |= sbp->sb_bad_features2;
935 sbp->sb_bad_features2 = sbp->sb_features2; 933 sbp->sb_bad_features2 = sbp->sb_features2;
936 update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; 934 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
937 935
938 /* 936 /*
939 * Re-check for ATTR2 in case it was found in bad_features2 937 * Re-check for ATTR2 in case it was found in bad_features2
@@ -947,11 +945,11 @@ xfs_mountfs(
947 if (xfs_sb_version_hasattr2(&mp->m_sb) && 945 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
948 (mp->m_flags & XFS_MOUNT_NOATTR2)) { 946 (mp->m_flags & XFS_MOUNT_NOATTR2)) {
949 xfs_sb_version_removeattr2(&mp->m_sb); 947 xfs_sb_version_removeattr2(&mp->m_sb);
950 update_flags |= XFS_SB_FEATURES2; 948 mp->m_update_flags |= XFS_SB_FEATURES2;
951 949
952 /* update sb_versionnum for the clearing of the morebits */ 950 /* update sb_versionnum for the clearing of the morebits */
953 if (!sbp->sb_features2) 951 if (!sbp->sb_features2)
954 update_flags |= XFS_SB_VERSIONNUM; 952 mp->m_update_flags |= XFS_SB_VERSIONNUM;
955 } 953 }
956 954
957 /* 955 /*
@@ -960,7 +958,7 @@ xfs_mountfs(
960 * allocator alignment is within an ag, therefore ag has 958 * allocator alignment is within an ag, therefore ag has
961 * to be aligned at stripe boundary. 959 * to be aligned at stripe boundary.
962 */ 960 */
963 error = xfs_update_alignment(mp, &update_flags); 961 error = xfs_update_alignment(mp);
964 if (error) 962 if (error)
965 goto error1; 963 goto error1;
966 964
@@ -1137,10 +1135,12 @@ xfs_mountfs(
1137 } 1135 }
1138 1136
1139 /* 1137 /*
1140 * If fs is not mounted readonly, then update the superblock changes. 1138 * If this is a read-only mount defer the superblock updates until
1139 * the next remount into writeable mode. Otherwise we would never
1140 * perform the update e.g. for the root filesystem.
1141 */ 1141 */
1142 if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 1142 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1143 error = xfs_mount_log_sb(mp, update_flags); 1143 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1144 if (error) { 1144 if (error) {
1145 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1145 cmn_err(CE_WARN, "XFS: failed to write sb changes");
1146 goto error4; 1146 goto error4;
@@ -1820,7 +1820,7 @@ xfs_uuid_mount(
1820 * be altered by the mount options, as well as any potential sb_features2 1820 * be altered by the mount options, as well as any potential sb_features2
1821 * fixup. Only the first superblock is updated. 1821 * fixup. Only the first superblock is updated.
1822 */ 1822 */
1823STATIC int 1823int
1824xfs_mount_log_sb( 1824xfs_mount_log_sb(
1825 xfs_mount_t *mp, 1825 xfs_mount_t *mp,
1826 __int64_t fields) 1826 __int64_t fields)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c1e028467327..f5e9937f9bdb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -44,9 +44,9 @@ typedef struct xfs_trans_reservations {
44 44
45#ifndef __KERNEL__ 45#ifndef __KERNEL__
46 46
47#define XFS_DADDR_TO_AGNO(mp,d) \ 47#define xfs_daddr_to_agno(mp,d) \
48 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) 48 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
49#define XFS_DADDR_TO_AGBNO(mp,d) \ 49#define xfs_daddr_to_agbno(mp,d) \
50 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) 50 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
51 51
52#else /* __KERNEL__ */ 52#else /* __KERNEL__ */
@@ -327,6 +327,8 @@ typedef struct xfs_mount {
327 spinlock_t m_sync_lock; /* work item list lock */ 327 spinlock_t m_sync_lock; /* work item list lock */
328 int m_sync_seq; /* sync thread generation no. */ 328 int m_sync_seq; /* sync thread generation no. */
329 wait_queue_head_t m_wait_single_sync_task; 329 wait_queue_head_t m_wait_single_sync_task;
330 __int64_t m_update_flags; /* sb flags we need to update
331 on the next remount,rw */
330} xfs_mount_t; 332} xfs_mount_t;
331 333
332/* 334/*
@@ -439,7 +441,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
439 */ 441 */
440#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ 442#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */
441 443
442#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d)
443static inline xfs_agnumber_t 444static inline xfs_agnumber_t
444xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) 445xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
445{ 446{
@@ -448,7 +449,6 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
448 return (xfs_agnumber_t) ld; 449 return (xfs_agnumber_t) ld;
449} 450}
450 451
451#define XFS_DADDR_TO_AGBNO(mp,d) xfs_daddr_to_agbno(mp,d)
452static inline xfs_agblock_t 452static inline xfs_agblock_t
453xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) 453xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
454{ 454{
@@ -514,6 +514,7 @@ extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
514 int64_t, int); 514 int64_t, int);
515extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 515extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
516 uint, int); 516 uint, int);
517extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
517extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 518extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
518extern int xfs_readsb(xfs_mount_t *, int); 519extern int xfs_readsb(xfs_mount_t *, int);
519extern void xfs_freesb(xfs_mount_t *); 520extern void xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 86471bb40fd4..58f85e9cd11d 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -147,7 +147,7 @@ xfs_rename(
147 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, 147 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
148 inodes, &num_inodes); 148 inodes, &num_inodes);
149 149
150 XFS_BMAP_INIT(&free_list, &first_block); 150 xfs_bmap_init(&free_list, &first_block);
151 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); 151 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
152 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 152 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
153 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 153 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index edf12c7b834c..c5bb86f3ec05 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -120,7 +120,7 @@ xfs_growfs_rt_alloc(
120 if ((error = xfs_trans_iget(mp, tp, ino, 0, 120 if ((error = xfs_trans_iget(mp, tp, ino, 0,
121 XFS_ILOCK_EXCL, &ip))) 121 XFS_ILOCK_EXCL, &ip)))
122 goto error_cancel; 122 goto error_cancel;
123 XFS_BMAP_INIT(&flist, &firstblock); 123 xfs_bmap_init(&flist, &firstblock);
124 /* 124 /*
125 * Allocate blocks to the bitmap file. 125 * Allocate blocks to the bitmap file.
126 */ 126 */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f87db5344ce6..f76c003ec55d 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -28,7 +28,6 @@ struct xfs_mount;
28 * file is a real time file or not, because the bmap code 28 * file is a real time file or not, because the bmap code
29 * does. 29 * does.
30 */ 30 */
31#define XFS_FSB_TO_DB(ip,fsb) xfs_fsb_to_db(ip,fsb)
32static inline xfs_daddr_t 31static inline xfs_daddr_t
33xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) 32xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
34{ 33{
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1ed71916e4c9..1b017c657494 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -505,7 +505,7 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
505 505
506#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) 506#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
507#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ 507#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
508 XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d)) 508 xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
509#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ 509#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \
510 XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) 510 XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
511 511
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 0f5191644ab2..b2f724502f1b 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -45,7 +45,7 @@ typedef __uint32_t prid_t; /* project ID */
45typedef __uint32_t inst_t; /* an instruction */ 45typedef __uint32_t inst_t; /* an instruction */
46 46
47typedef __s64 xfs_off_t; /* <file offset> type */ 47typedef __s64 xfs_off_t; /* <file offset> type */
48typedef __u64 xfs_ino_t; /* <inode> type */ 48typedef unsigned long long xfs_ino_t; /* <inode> type */
49typedef __s64 xfs_daddr_t; /* <disk address> type */ 49typedef __s64 xfs_daddr_t; /* <disk address> type */
50typedef char * xfs_caddr_t; /* <core address> type */ 50typedef char * xfs_caddr_t; /* <core address> type */
51typedef __u32 xfs_dev_t; 51typedef __u32 xfs_dev_t;
@@ -111,8 +111,6 @@ typedef __uint64_t xfs_fileoff_t; /* block number in a file */
111typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ 111typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
112typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ 112typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
113 113
114typedef __uint8_t xfs_arch_t; /* architecture of an xfs fs */
115
116/* 114/*
117 * Null values for the types. 115 * Null values for the types.
118 */ 116 */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f07bf8768c3a..0e55c5d7db5f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -862,7 +862,7 @@ xfs_inactive_symlink_rmt(
862 * Find the block(s) so we can inval and unmap them. 862 * Find the block(s) so we can inval and unmap them.
863 */ 863 */
864 done = 0; 864 done = 0;
865 XFS_BMAP_INIT(&free_list, &first_block); 865 xfs_bmap_init(&free_list, &first_block);
866 nmaps = ARRAY_SIZE(mval); 866 nmaps = ARRAY_SIZE(mval);
867 if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), 867 if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
868 XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, 868 XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
@@ -1288,7 +1288,7 @@ xfs_inactive(
1288 /* 1288 /*
1289 * Free the inode. 1289 * Free the inode.
1290 */ 1290 */
1291 XFS_BMAP_INIT(&free_list, &first_block); 1291 xfs_bmap_init(&free_list, &first_block);
1292 error = xfs_ifree(tp, ip, &free_list); 1292 error = xfs_ifree(tp, ip, &free_list);
1293 if (error) { 1293 if (error) {
1294 /* 1294 /*
@@ -1461,7 +1461,7 @@ xfs_create(
1461 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 1461 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1462 unlock_dp_on_error = B_TRUE; 1462 unlock_dp_on_error = B_TRUE;
1463 1463
1464 XFS_BMAP_INIT(&free_list, &first_block); 1464 xfs_bmap_init(&free_list, &first_block);
1465 1465
1466 ASSERT(ip == NULL); 1466 ASSERT(ip == NULL);
1467 1467
@@ -1879,7 +1879,7 @@ xfs_remove(
1879 } 1879 }
1880 } 1880 }
1881 1881
1882 XFS_BMAP_INIT(&free_list, &first_block); 1882 xfs_bmap_init(&free_list, &first_block);
1883 error = xfs_dir_removename(tp, dp, name, ip->i_ino, 1883 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
1884 &first_block, &free_list, resblks); 1884 &first_block, &free_list, resblks);
1885 if (error) { 1885 if (error) {
@@ -2059,7 +2059,7 @@ xfs_link(
2059 if (error) 2059 if (error)
2060 goto error_return; 2060 goto error_return;
2061 2061
2062 XFS_BMAP_INIT(&free_list, &first_block); 2062 xfs_bmap_init(&free_list, &first_block);
2063 2063
2064 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, 2064 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
2065 &first_block, &free_list, resblks); 2065 &first_block, &free_list, resblks);
@@ -2231,7 +2231,7 @@ xfs_mkdir(
2231 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 2231 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2232 unlock_dp_on_error = B_FALSE; 2232 unlock_dp_on_error = B_FALSE;
2233 2233
2234 XFS_BMAP_INIT(&free_list, &first_block); 2234 xfs_bmap_init(&free_list, &first_block);
2235 2235
2236 error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino, 2236 error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
2237 &first_block, &free_list, resblks ? 2237 &first_block, &free_list, resblks ?
@@ -2438,7 +2438,7 @@ xfs_symlink(
2438 * Initialize the bmap freelist prior to calling either 2438 * Initialize the bmap freelist prior to calling either
2439 * bmapi or the directory create code. 2439 * bmapi or the directory create code.
2440 */ 2440 */
2441 XFS_BMAP_INIT(&free_list, &first_block); 2441 xfs_bmap_init(&free_list, &first_block);
2442 2442
2443 /* 2443 /*
2444 * Allocate an inode for the symlink. 2444 * Allocate an inode for the symlink.
@@ -2860,7 +2860,7 @@ retry:
2860 /* 2860 /*
2861 * Issue the xfs_bmapi() call to allocate the blocks 2861 * Issue the xfs_bmapi() call to allocate the blocks
2862 */ 2862 */
2863 XFS_BMAP_INIT(&free_list, &firstfsb); 2863 xfs_bmap_init(&free_list, &firstfsb);
2864 error = xfs_bmapi(tp, ip, startoffset_fsb, 2864 error = xfs_bmapi(tp, ip, startoffset_fsb,
2865 allocatesize_fsb, bmapi_flag, 2865 allocatesize_fsb, bmapi_flag,
2866 &firstfsb, 0, imapp, &nimaps, 2866 &firstfsb, 0, imapp, &nimaps,
@@ -2980,7 +2980,7 @@ xfs_zero_remaining_bytes(
2980 XFS_BUF_UNDONE(bp); 2980 XFS_BUF_UNDONE(bp);
2981 XFS_BUF_UNWRITE(bp); 2981 XFS_BUF_UNWRITE(bp);
2982 XFS_BUF_READ(bp); 2982 XFS_BUF_READ(bp);
2983 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); 2983 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
2984 xfsbdstrat(mp, bp); 2984 xfsbdstrat(mp, bp);
2985 error = xfs_iowait(bp); 2985 error = xfs_iowait(bp);
2986 if (error) { 2986 if (error) {
@@ -3186,7 +3186,7 @@ xfs_free_file_space(
3186 /* 3186 /*
3187 * issue the bunmapi() call to free the blocks 3187 * issue the bunmapi() call to free the blocks
3188 */ 3188 */
3189 XFS_BMAP_INIT(&free_list, &firstfsb); 3189 xfs_bmap_init(&free_list, &firstfsb);
3190 error = xfs_bunmapi(tp, ip, startoffset_fsb, 3190 error = xfs_bunmapi(tp, ip, startoffset_fsb,
3191 endoffset_fsb - startoffset_fsb, 3191 endoffset_fsb - startoffset_fsb,
3192 0, 2, &firstfsb, &free_list, NULL, &done); 3192 0, 2, &firstfsb, &free_list, NULL, &done);